Diffstat (limited to 'third_party/dav1d/src/x86')
-rw-r--r--  third_party/dav1d/src/x86/cdef.h | 87
-rw-r--r--  third_party/dav1d/src/x86/cdef16_avx2.asm | 877
-rw-r--r--  third_party/dav1d/src/x86/cdef16_avx512.asm | 622
-rw-r--r--  third_party/dav1d/src/x86/cdef16_sse.asm | 1033
-rw-r--r--  third_party/dav1d/src/x86/cdef_avx2.asm | 1772
-rw-r--r--  third_party/dav1d/src/x86/cdef_avx512.asm | 860
-rw-r--r--  third_party/dav1d/src/x86/cdef_sse.asm | 1357
-rw-r--r--  third_party/dav1d/src/x86/cpu.c | 100
-rw-r--r--  third_party/dav1d/src/x86/cpu.h | 44
-rw-r--r--  third_party/dav1d/src/x86/cpuid.asm | 55
-rw-r--r--  third_party/dav1d/src/x86/filmgrain.h | 81
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_avx2.asm | 2248
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_avx512.asm | 932
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_sse.asm | 3421
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_avx2.asm | 2107
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_avx512.asm | 813
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_common.asm | 46
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_sse.asm | 3233
-rw-r--r--  third_party/dav1d/src/x86/ipred.h | 151
-rw-r--r--  third_party/dav1d/src/x86/ipred16_avx2.asm | 4992
-rw-r--r--  third_party/dav1d/src/x86/ipred16_avx512.asm | 833
-rw-r--r--  third_party/dav1d/src/x86/ipred16_sse.asm | 1923
-rw-r--r--  third_party/dav1d/src/x86/ipred_avx2.asm | 5387
-rw-r--r--  third_party/dav1d/src/x86/ipred_avx512.asm | 1432
-rw-r--r--  third_party/dav1d/src/x86/ipred_sse.asm | 5409
-rw-r--r--  third_party/dav1d/src/x86/itx.h | 363
-rw-r--r--  third_party/dav1d/src/x86/itx16_avx2.asm | 8599
-rw-r--r--  third_party/dav1d/src/x86/itx16_avx512.asm | 4133
-rw-r--r--  third_party/dav1d/src/x86/itx16_sse.asm | 8135
-rw-r--r--  third_party/dav1d/src/x86/itx_avx2.asm | 5542
-rw-r--r--  third_party/dav1d/src/x86/itx_avx512.asm | 7389
-rw-r--r--  third_party/dav1d/src/x86/itx_sse.asm | 6533
-rw-r--r--  third_party/dav1d/src/x86/loopfilter.h | 66
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_avx2.asm | 1161
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_avx512.asm | 912
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_sse.asm | 1793
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_avx2.asm | 1569
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_avx512.asm | 1534
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_sse.asm | 2348
-rw-r--r--  third_party/dav1d/src/x86/looprestoration.h | 94
-rw-r--r--  third_party/dav1d/src/x86/looprestoration16_avx2.asm | 2540
-rw-r--r--  third_party/dav1d/src/x86/looprestoration16_avx512.asm | 2524
-rw-r--r--  third_party/dav1d/src/x86/looprestoration16_sse.asm | 3723
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_avx2.asm | 2237
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_avx512.asm | 2122
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_sse.asm | 3681
-rw-r--r--  third_party/dav1d/src/x86/mc.h | 299
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx2.asm | 5879
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx512.asm | 4858
-rw-r--r--  third_party/dav1d/src/x86/mc16_sse.asm | 8731
-rw-r--r--  third_party/dav1d/src/x86/mc_avx2.asm | 5669
-rw-r--r--  third_party/dav1d/src/x86/mc_avx512.asm | 4538
-rw-r--r--  third_party/dav1d/src/x86/mc_sse.asm | 9599
-rw-r--r--  third_party/dav1d/src/x86/msac.asm | 667
-rw-r--r--  third_party/dav1d/src/x86/msac.h | 75
-rw-r--r--  third_party/dav1d/src/x86/refmvs.asm | 688
-rw-r--r--  third_party/dav1d/src/x86/refmvs.h | 61
57 files changed, 147877 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/cdef.h b/third_party/dav1d/src/x86/cdef.h
new file mode 100644
index 0000000000..553d650741
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+#define decl_cdef_fns(ext) \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
+
+decl_cdef_fns(avx512icl);
+decl_cdef_fns(avx2);
+decl_cdef_fns(sse4);
+decl_cdef_fns(ssse3);
+decl_cdef_fns(sse2);
+
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
+
+static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+#if BITDEPTH == 8
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->dir = BF(dav1d_cdef_dir, ssse3);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+ c->dir = BF(dav1d_cdef_dir, sse4);
+#if BITDEPTH == 8
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->dir = BF(dav1d_cdef_dir, avx2);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
+#endif
+}
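
The header above wires the hand-written SIMD that follows into dav1d's function-pointer tables: cdef_dsp_init_x86() assigns the pointers in ascending ISA order and returns early as soon as a required CPU flag is missing, so the fastest supported implementation is written last and is the one that sticks. Below is a minimal, self-contained sketch of that fall-through dispatch pattern; the flag values, type names and filter stubs are illustrative placeholders, not dav1d's real API.

#include <stdio.h>

/* Hypothetical CPU-feature flags and context type, for illustration only. */
enum { FLAG_SSE2 = 1 << 0, FLAG_SSSE3 = 1 << 1, FLAG_AVX2 = 1 << 2 };

typedef void (*cdef_fb_fn)(void);
typedef struct { cdef_fb_fn fb; } CdefDSPContext;

static void cdef_fb_sse2(void)  { puts("sse2 path");  }
static void cdef_fb_ssse3(void) { puts("ssse3 path"); }
static void cdef_fb_avx2(void)  { puts("avx2 path");  }

static void cdef_dsp_init(CdefDSPContext *const c, const unsigned flags) {
    /* Assign in ascending ISA order; each early return leaves the pointer
     * at the highest level whose flag check passed. */
    if (!(flags & FLAG_SSE2))  return;
    c->fb = cdef_fb_sse2;
    if (!(flags & FLAG_SSSE3)) return;
    c->fb = cdef_fb_ssse3;
    if (!(flags & FLAG_AVX2))  return;
    c->fb = cdef_fb_avx2;
}

int main(void) {
    CdefDSPContext c = { 0 };
    cdef_dsp_init(&c, FLAG_SSE2 | FLAG_SSSE3); /* no AVX2 -> SSSE3 wins */
    if (c.fb) c.fb();
    return 0;
}

The real initializer additionally gates the SSE2/SSE4 filter entries behind BITDEPTH == 8 and the AVX2/AVX-512 entries behind ARCH_X86_64, as visible in the header above.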
diff --git a/third_party/dav1d/src/x86/cdef16_avx2.asm b/third_party/dav1d/src/x86/cdef16_avx2.asm
new file mode 100644
index 0000000000..4c8d3bca43
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_avx2.asm
@@ -0,0 +1,877 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+%macro DIR_TABLE 1 ; stride
+ db 1 * %1 + 0, 2 * %1 + 0
+ db 1 * %1 + 0, 2 * %1 - 2
+ db -1 * %1 + 2, -2 * %1 + 4
+ db 0 * %1 + 2, -1 * %1 + 4
+ db 0 * %1 + 2, 0 * %1 + 4
+ db 0 * %1 + 2, 1 * %1 + 4
+ db 1 * %1 + 2, 2 * %1 + 4
+ db 1 * %1 + 0, 2 * %1 + 2
+ db 1 * %1 + 0, 2 * %1 + 0
+ db 1 * %1 + 0, 2 * %1 - 2
+ db -1 * %1 + 2, -2 * %1 + 4
+ db 0 * %1 + 2, -1 * %1 + 4
+%endmacro
+
+dir_table4: DIR_TABLE 16
+dir_table8: DIR_TABLE 32
+pri_taps: dw 4, 4, 3, 3, 2, 2, 3, 3
+
+dir_shift: times 2 dw 0x4000
+ times 2 dw 0x1000
+
+pw_2048: times 2 dw 2048
+pw_m16384: times 2 dw -16384
+
+cextern cdef_dir_8bpc_avx2.main
+
+SECTION .text
+
+%macro CDEF_FILTER 2 ; w, h
+ DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ mov dird, r7m
+ vpbroadcastd m8, [base+pw_2048]
+ lea dirq, [base+dir_table%1+dirq*2]
+ test prid, prid
+ jz .sec_only
+%if WIN64
+ vpbroadcastw m6, prim
+ movaps [rsp+16*0], xmm9
+ movaps [rsp+16*1], xmm10
+%else
+ movd xm6, prid
+ vpbroadcastw m6, xm6
+%endif
+ lzcnt pridmpd, prid
+ rorx tmpd, prid, 2
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, tmpd ; pri >>= 2
+ mov tmpd, r8m ; damping
+ and prid, 4
+ sub tmpd, 31
+ vpbroadcastd m9, [base+pri_taps+priq+8*0]
+ vpbroadcastd m10, [base+pri_taps+priq+8*1]
+ test secd, secd
+ jz .pri_only
+%if WIN64
+ movaps r8m, xmm13
+ vpbroadcastw m13, secm
+ movaps r4m, xmm11
+ movaps r6m, xmm12
+%else
+ movd xm0, secd
+ vpbroadcastw m13, xm0
+%endif
+ lzcnt secd, secd
+ xor prid, prid
+ add pridmpd, tmpd
+ cmovs pridmpd, prid
+ add secd, tmpd
+ lea tmpq, [px]
+ mov [pri_shift], pridmpq
+ mov [sec_shift], secq
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
+%endrep
+%if WIN64
+ movaps xmm11, r4m
+ movaps xmm12, r6m
+ movaps xmm13, r8m
+%endif
+ jmp .pri_end
+.pri_only:
+ add pridmpd, tmpd
+ cmovs pridmpd, secd
+ lea tmpq, [px]
+ mov [pri_shift], pridmpq
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
+%endrep
+.pri_end:
+%if WIN64
+ movaps xmm9, [rsp+16*0]
+ movaps xmm10, [rsp+16*1]
+%endif
+.end:
+ RET
+.sec_only:
+ mov tmpd, r8m ; damping
+%if WIN64
+ vpbroadcastw m6, secm
+%else
+ movd xm6, secd
+ vpbroadcastw m6, xm6
+%endif
+ tzcnt secd, secd
+ sub tmpd, secd
+ mov [sec_shift], tmpq
+ lea tmpq, [px]
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
+%endrep
+ jmp .end
+%if %1 == %2
+ALIGN function_align
+.pri:
+ movsx offq, byte [dirq+4] ; off_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1] ; 0 2 1 3
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
+%else
+ mova xm1, [tmpq+32*0]
+ vinserti128 m1, [tmpq+32*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+5] ; off_k1
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m0, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4 ; adiff_k1p0
+ paddw m0, m2 ; constrain(diff_k0)
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m0, m9 ; pri_tap_k0
+ pmullw m7, m10 ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ paddw m0, m1
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+ALIGN function_align
+.sec:
+ movsx offq, byte [dirq+8] ; off1_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1]
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
+%else
+ mova xm1, [tmpq+32*0]
+ vinserti128 m1, [tmpq+32*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+0] ; off2_k0
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+9] ; off1_k1
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m4 ; adiff_k0s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+1] ; off2_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ paddw m0, m7
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k1s3)
+ paddw m0, m4 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ paddw m0, m1
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+ALIGN function_align
+.pri_sec:
+ movsx offq, byte [dirq+8] ; off2_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1]
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
+%else
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+0] ; off3_k0
+ pmaxsw m11, m2, m3
+ pminuw m12, m2, m3
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m13, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m13, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+9] ; off2_k1
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ paddw m0, m2
+ pabsw m3, m4 ; adiff_k0s2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m13, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m13, m3
+ pminsw m4, m2
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+1] ; off3_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ pmaxsw m11, m2
+ pminuw m12, m2
+ pmaxsw m11, m3
+ pminuw m12, m3
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m13, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m13, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+4] ; off1_k0
+ paddw m0, m7
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m13, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m13, m3
+ pminsw m4, m2
+ paddw m0, m7
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+5] ; off1_k1
+ psignw m4, m5 ; constrain(diff_k1s3)
+ pmaxsw m11, m2
+ pminuw m12, m2
+ pmaxsw m11, m3
+ pminuw m12, m3
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ paddw m0, m4
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m7, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ psignw m2, m3 ; constrain(diff_k0p1)
+ paddw m7, m2 ; constrain(diff_k0)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ pabsw m3, m4 ; adiff_k1p0
+ pmullw m7, m9 ; pri_tap_k0
+ paddw m0, m7
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m7, m10 ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ pmaxsw m11, m1
+ pminuw m12, m1
+ paddw m0, m1
+ pminsw m0, m11
+ pmaxsw m0, m12
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+%endif
+%endmacro
+
+INIT_YMM avx2
+cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \
+ pri, sec, edge
+%if WIN64
+ %define px rsp+16*6
+ %define offq r8
+ %define pri_shift rsp+16*2
+ %define sec_shift rsp+16*3
+%else
+ %define px rsp+16*4
+ %define offq r4
+ %define pri_shift rsp+16*0
+ %define sec_shift rsp+16*1
+%endif
+ %define base r8-dir_table4
+ mov edged, r9m
+ lea r8, [dir_table4]
+ movu xm0, [dstq+strideq*0]
+ movu xm1, [dstq+strideq*1]
+ lea r9, [strideq*3]
+ movu xm2, [dstq+strideq*2]
+ movu xm3, [dstq+r9 ]
+ vpbroadcastd m7, [base+pw_m16384]
+ mova [px+16*0+0], xm0
+ mova [px+16*1+0], xm1
+ mova [px+16*2+0], xm2
+ mova [px+16*3+0], xm3
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu xm0, [topq+strideq*0]
+ movu xm1, [topq+strideq*1]
+ mova [px-16*2+0], xm0
+ mova [px-16*1+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-16*2-4], xm0
+ movd [px-16*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-16*2+0], m7
+.top_no_left:
+ movd [px-16*2-4], xm7
+ movd [px-16*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu xm0, [botq+strideq*0]
+ movu xm1, [botq+strideq*1]
+ mova [px+16*4+0], xm0
+ mova [px+16*5+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+16*4-4], xm0
+ movd [px+16*5-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+16*4+0], m7
+.bottom_no_left:
+ movd [px+16*4-4], xm7
+ movd [px+16*5-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+16*0-4], xm0
+ movd [px+16*1-4], xm1
+ movd [px+16*2-4], xm2
+ movd [px+16*3-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5
+.padding_done:
+ CDEF_FILTER 4, 4
+
+cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+ mov edged, r9m
+ movu xm0, [dstq+strideq*0]
+ movu xm1, [dstq+strideq*1]
+ lea r9, [strideq*3]
+ movu xm2, [dstq+strideq*2]
+ movu xm3, [dstq+r9 ]
+ lea r6, [dstq+strideq*4]
+ movu xm4, [r6 +strideq*0]
+ movu xm5, [r6 +strideq*1]
+ movu xm6, [r6 +strideq*2]
+ movu xm7, [r6 +r9 ]
+ lea r8, [dir_table4]
+ mova [px+16*0+0], xm0
+ mova [px+16*1+0], xm1
+ mova [px+16*2+0], xm2
+ mova [px+16*3+0], xm3
+ mova [px+16*4+0], xm4
+ mova [px+16*5+0], xm5
+ mova [px+16*6+0], xm6
+ mova [px+16*7+0], xm7
+ vpbroadcastd m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu xm0, [topq+strideq*0]
+ movu xm1, [topq+strideq*1]
+ mova [px-16*2+0], xm0
+ mova [px-16*1+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-16*2-4], xm0
+ movd [px-16*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-16*2+0], m7
+.top_no_left:
+ movd [px-16*2-4], xm7
+ movd [px-16*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu xm0, [botq+strideq*0]
+ movu xm1, [botq+strideq*1]
+ mova [px+16*8+0], xm0
+ mova [px+16*9+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+16*8-4], xm0
+ movd [px+16*9-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+16*8+0], m7
+.bottom_no_left:
+ movd [px+16*8-4], xm7
+ movd [px+16*9-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+16*0-4], xm0
+ movd [px+16*1-4], xm1
+ movd [px+16*2-4], xm2
+ movd [px+16*3-4], xm3
+ movd xm0, [leftq+4*4]
+ movd xm1, [leftq+4*5]
+ movd xm2, [leftq+4*6]
+ movd xm3, [leftq+4*7]
+ movd [px+16*4-4], xm0
+ movd [px+16*5-4], xm1
+ movd [px+16*6-4], xm2
+ movd [px+16*7-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 4, 8
+
+cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \
+ pri, sec, edge
+%if WIN64
+ %define px rsp+32*4
+%else
+ %define px rsp+32*3
+%endif
+ %define base r8-dir_table8
+ mov edged, r9m
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movu m2, [r6 +strideq*0]
+ movu m3, [r6 +strideq*1]
+ lea r6, [r6 +strideq*2]
+ movu m4, [r6 +strideq*0]
+ movu m5, [r6 +strideq*1]
+ lea r6, [r6 +strideq*2]
+ movu m6, [r6 +strideq*0]
+ movu m7, [r6 +strideq*1]
+ lea r8, [dir_table8]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ mova [px+32*4+0], m4
+ mova [px+32*5+0], m5
+ mova [px+32*6+0], m6
+ mova [px+32*7+0], m7
+ vpbroadcastd m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-32*2-4], xm0
+ movd [px-32*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], xm7
+ movd [px-32*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*8+0], m0
+ mova [px+32*9+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+32*8-4], xm0
+ movd [px+32*9-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+0], m7
+ mova [px+32*9+0], m7
+.bottom_no_left:
+ movd [px+32*8-4], xm7
+ movd [px+32*9-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+32*0-4], xm0
+ movd [px+32*1-4], xm1
+ movd [px+32*2-4], xm2
+ movd [px+32*3-4], xm3
+ movd xm0, [leftq+4*4]
+ movd xm1, [leftq+4*5]
+ movd xm2, [leftq+4*6]
+ movd xm3, [leftq+4*7]
+ movd [px+32*4-4], xm0
+ movd [px+32*5-4], xm1
+ movd [px+32*6-4], xm2
+ movd [px+32*7-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 8, 8
+
+cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax
+ lea r6, [dir_shift]
+ shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
+ vpbroadcastd m4, [r6+bdmaxq*4]
+ lea r6, [strideq*3]
+ mova xm0, [srcq+strideq*0]
+ mova xm1, [srcq+strideq*1]
+ mova xm2, [srcq+strideq*2]
+ mova xm3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, [srcq+r6 ], 1
+ vinserti128 m1, [srcq+strideq*2], 1
+ vinserti128 m2, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*0], 1
+ REPX {pmulhuw x, m4}, m0, m1, m2, m3
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
+
+%endif ; ARCH_X86_64
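
Each pabsw / psrlw / psubusw / pminsw / psignw group in the filter macro above evaluates CDEF's constrain() primitive on a register of 16-bit tap differences, with the shift operand precomputed on entry as damping minus the integer log2 of the strength, clamped to non-negative (the lzcnt/sub/cmovs sequence at the top of CDEF_FILTER). A scalar sketch of that lane-wise operation, reconstructed from the instruction sequence rather than copied from dav1d's C reference:

#include <stdlib.h>

/* One 16-bit lane of the SIMD constrain: limit the tap difference by how far
 * its magnitude falls below the damping-scaled strength threshold. */
static int constrain(const int diff, const int threshold, const int shift)
{
    const int adiff = abs(diff);                   /* pabsw                 */
    int limit = threshold - (adiff >> shift);      /* psrlw + psubusw ...   */
    if (limit < 0) limit = 0;                      /* ...unsigned saturate  */
    const int mag = adiff < limit ? adiff : limit; /* pminsw                */
    return diff < 0 ? -mag : mag;                  /* psignw (0 stays 0)    */
}

The per-pixel sums then weight these constrained values with the pri_taps/sec_taps constants broadcast at the top of the macro.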
diff --git a/third_party/dav1d/src/x86/cdef16_avx512.asm b/third_party/dav1d/src/x86/cdef16_avx512.asm
new file mode 100644
index 0000000000..6d625a02a0
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_avx512.asm
@@ -0,0 +1,622 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21
+ db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29
+ db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37
+ db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
+end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
+ dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
+ dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
+pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4
+cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6
+ dw 1, 2, 1, 10, 9, 18, 8, 17
+ dw 8, 16, 8, 15, -7,-14, 1, -6
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28
+ db 2, 4, 2, 36, 34, 68, 32, 66
+ db 32, 64, 32, 62,-30,-60, 2,-28
+pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3
+sec_taps4: dw 32, 16
+pw_m16384: times 2 dw -16384
+pw_2048: times 2 dw 2048
+pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4)
+edge_mask8: dw 0x2121, 0x2020, 0x0101
+
+SECTION .text
+
+%macro CONSTRAIN 7 ; dst, p, px, zero, thresh, shift, tmp
+ psubw %1, %2, %3
+ pabsw %1, %1
+ vpcmpgtw k1, %3, %2
+ vpsrlvw %7, %1, %6
+ psubusw %7, %5, %7
+ pminsw %1, %7
+ vpsubw %1{k1}, %4, %1
+%endmacro
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs4
+ lea r6, [cdef_dirs4]
+ movu xm3, [dstq+strideq*0]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ mova xm2, [leftq]
+ lea r2, [dstq+strideq*2]
+ vinserti32x4 m3, [r2+strideq*0], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m3, [r2+strideq*1], 3
+ vpermt2d m2, m5, m3
+ vinserti32x4 m1, m2, [topq+strideq*0-4], 0
+ vinserti32x4 m1, [topq+strideq*1-4], 1
+ mov r3d, edgem
+ movifnidn prid, prim
+ punpcklwd m3, m3 ; px
+ psrlw m5, 8
+ vpbroadcastd m0, [base+pd_268435568]
+ pxor m12, m12
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m6, m3, m8
+ pmaxsw m7, m3, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ call .constrain_sec
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ psrldq m8, m6, 2
+ vpshldd m3, m0, 8
+ psrldq m9, m7, 2
+ paddd m0, m3
+ pminuw m6, m8
+ psrldq m0, 1
+ pmaxsw m7, m9
+ pmaxsw m0, m6
+ pminsw m0, m7
+ vpmovdw ym0, m0
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ mov r4d, dirm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym1, [base+end_perm4]
+ vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ vpermb m0, m1, m0
+.end:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm0, ym0, 1
+ movq [r2+strideq*0], xm0
+ movhps [r2+strideq*1], xm0
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ or r3d, 0x04
+ vmovdqa32 m1{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m8, m5, m9
+ vpermi2w m8, m1, m2 ; k0p0 k1p0
+ psubw m9, m5, m9
+ vpermi2w m9, m1, m2 ; k0p1 k1p1
+ CONSTRAIN m10, m8, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ CONSTRAIN m10, m9, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ ret
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7
+
+cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+ lea r6, [cdef_dirs4]
+ movu xm18, [dstq+strideq*0]
+ vinserti128 ym18, [dstq+strideq*1], 1
+ mova xm1, [leftq+16*0]
+ mova xm2, [leftq+16*1]
+ lea r2, [strideq*3]
+ vinserti32x4 m18, [dstq+strideq*2], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m18, [dstq+r2 ], 3
+ vpermt2d m1, m5, m18
+ vinserti32x4 m0, m1, [topq+strideq*0-4], 0
+ vinserti32x4 m0, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu xm19, [r3+strideq*0]
+ vinserti128 ym19, [r3+strideq*1], 1
+ vinserti32x4 m19, [r3+strideq*2], 2
+ vinserti32x4 m19, [r3+r2 ], 3
+ mov r3d, edgem
+ movifnidn prid, prim
+ vpermt2d m2, m5, m19
+ vpbroadcastd m16, [base+pd_268435568]
+ pxor m12, m12
+ punpcklwd m18, m18 ; px (top)
+ psrlw m5, 8
+ punpcklwd m19, m19 ; px (bottom)
+ mova m17, m16
+ vshufi32x4 m1, m2, q3210
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m3, m18, m6
+ pmaxsw m4, m18, m6
+ pminuw m20, m19, m7
+ pmaxsw m21, m19, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ call .constrain_sec
+ pminuw m3, m6
+ pmaxsw m4, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m3, m6
+ pmaxsw m4, m6
+ mov r3, 0xcccccccccccccccc
+ pminuw m20, m7
+ pmaxsw m21, m7
+ kmovq k1, r3
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vbroadcasti32x4 m0, [base+deint_shuf]
+ vpshldd m6, m20, m3, 16
+ vmovdqu8 m3{k1}, m20
+ vpshldd m18, m16, 8
+ vpshldd m7, m21, m4, 16
+ vmovdqu8 m4{k1}, m21
+ vpshldd m19, m17, 8
+ pminuw m3, m6
+ paddd m16, m18
+ pmaxsw m4, m7
+ paddd m17, m19
+ psrldq m16, 1
+ palignr m16{k1}, m17, m17, 15
+ lea r6, [dstq+strideq*4]
+ pmaxsw m16, m3
+ pminsw m16, m4
+ pshufb m16, m0
+ movq [dstq+strideq*0], xm16
+ movhps [r6 +strideq*0], xm16
+ vextracti128 xm17, ym16, 1
+ movq [dstq+strideq*1], xm17
+ movhps [r6 +strideq*1], xm17
+ vextracti32x4 xm17, m16, 2
+ movq [dstq+strideq*2], xm17
+ movhps [r6 +strideq*2], xm17
+ vextracti32x4 xm16, m16, 3
+ movq [dstq+r2 ], xm16
+ movhps [r6 +r2 ], xm16
+ RET
+.sec_only:
+ mov r4d, dirm
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym20, [base+end_perm4]
+ vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m19, m17, 8
+ paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddd m17, m19
+ vpermb m16, m20, m16
+ vpermb m17, m20, m17
+ movq [dstq+strideq*0], xm16
+ movhps [dstq+strideq*1], xm16
+ vextracti128 xm16, ym16, 1
+ movq [dstq+strideq*2], xm16
+ movhps [dstq+r2 ], xm16
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm17
+ movhps [dstq+strideq*1], xm17
+ vextracti128 xm17, ym17, 1
+ movq [dstq+strideq*2], xm17
+ movhps [dstq+r2 ], xm17
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ mov r4d, r3d
+ or r3d, 0x0c
+ vmovdqa32 m0{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ or r4d, 0x04
+ vmovdqa32 m1{k1}, m6
+ kmovw k1, [base+edge_mask4-8+r4*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m7, m5, m9
+ mova m6, m0
+ vpermt2w m6, m7, m1 ; k0p0 k1p0 (top)
+ psubw m9, m5, m9
+ mova m8, m0
+ vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom)
+ CONSTRAIN m10, m6, m18, m12, m13, m14, m11
+ vpermt2w m8, m9, m1 ; k0p1 k1p1 (top)
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m7, m19, m12, m13, m14, m11
+ vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom)
+ vpdpwssd m17, m10, m15
+ CONSTRAIN m10, m8, m18, m12, m13, m14, m11
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m9, m19, m12, m13, m14, m11
+ vpdpwssd m17, m10, m15
+ ret
+
+cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs8
+ lea r6, [cdef_dirs8]
+ movu ym17, [dstq+strideq*0]
+ vinserti32x8 m17, [dstq+strideq*1], 1
+ movq xm4, [leftq+8*0]
+ movq xm5, [leftq+8*1]
+ psrld m2, [base+cdef_perm], 16
+ movq xm6, [leftq+8*2]
+ movq xm7, [leftq+8*3]
+ lea r2, [strideq*3]
+ movu ym16, [topq+strideq*0-4]
+ vinserti32x8 m16, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu ym18, [dstq+strideq*2]
+ vinserti32x8 m18, [dstq+r2 ], 1
+ movu ym19, [r3+strideq*0]
+ vinserti32x8 m19, [r3+strideq*1], 1
+ movu ym20, [r3+strideq*2]
+ vinserti32x8 m20, [r3+r2 ], 1
+ vshufi32x4 m0, m17, m18, q2020 ; px (top)
+ mov r3d, edgem
+ vshufi32x4 m1, m19, m20, q2020 ; px (bottom)
+ movifnidn prid, prim
+ vpermt2d m17, m2, m4
+ vpermt2d m18, m2, m5
+ pxor m12, m12
+ vpermt2d m19, m2, m6
+ vpermt2d m20, m2, m7
+ cmp r3d, 0x0f
+ jne .mask_edges
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+.main:
+ mova [rsp+64*0], m16 ; top
+ mova [rsp+64*1], m17 ; 0 1
+ mova [rsp+64*2], m18 ; 2 3
+ mova [rsp+64*3], m19 ; 4 5
+ mova [rsp+64*4], m20 ; 6 7
+ mova [rsp+64*5], m21 ; bottom
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ add r4d, r3d ; pri_shift
+ vpbroadcastw m14, r4d
+ mov r4d, dirm
+ vpbroadcastd m2, [base+pri_taps8+priq*2+0]
+ vpbroadcastd m3, [base+pri_taps8+priq*2+4]
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
+ pmaxsw m14, m12
+ call .constrain
+ mov r5d, secm
+ pmullw m16, m8, m2
+ pmullw m17, m9, m2
+ test r5d, r5d
+ jnz .pri_sec
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ call .constrain
+ pmullw m8, m3
+ pmullw m9, m3
+ jmp .end_no_clip
+.pri_sec:
+ lzcnt r5d, r5d
+ add r3d, r5d ; sec_shift
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ pminuw m18, m0, m4
+ pmaxsw m19, m0, m4
+ pminuw m20, m1, m5
+ pmaxsw m21, m1, m5
+ call .min_max_constrain2
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
+ pmullw m8, m3
+ pmullw m9, m3
+ vpbroadcastw m13, secm
+ vpbroadcastw m14, r3d
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
+ mova m2, m8
+ mova m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
+ paddw m2, m8
+ paddw m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
+ paddw m2, m2
+ paddw m3, m3
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m2
+ paddw m17, m3
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ paddw m16, m0
+ paddw m17, m1
+ pmaxsw m16, m18
+ pmaxsw m17, m20
+ pminsw m16, m19
+ pminsw m17, m21
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r4d, dirm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0]
+ vpbroadcastw m14, r3d
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0]
+ mova m16, m8
+ mova m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1]
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1]
+ paddw m16, m16
+ paddw m17, m17
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+.end_no_clip:
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ paddw m16, m0
+ paddw m17, m1
+.end:
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm17
+ vextracti128 [dstq+strideq*1], ym17, 1
+ vextracti32x4 [dstq+strideq*2], m17, 2
+ vextracti32x4 [dstq+r2 ], m17, 3
+ RET
+.mask_edges:
+ vpbroadcastd m2, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+ jmp .mask_edges_top
+.mask_edges_no_bottom:
+ mova m21, m2
+.mask_edges_top:
+ test r3b, 0x04
+ jnz .mask_edges_main
+ mova m16, m2
+.mask_edges_main:
+ and r3d, 0x03
+ cmp r3d, 0x03
+ je .main
+ kmovw k1, [base+edge_mask8+r3*2]
+ vmovdqa32 m16{k1}, m2 ; edge pixels = -16384
+ vmovdqa32 m17{k1}, m2
+ vmovdqa32 m18{k1}, m2
+ vmovdqa32 m19{k1}, m2
+ vmovdqa32 m20{k1}, m2
+ vmovdqa32 m21{k1}, m2
+ jmp .main
+ALIGN function_align
+.min_max_constrain:
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+.min_max_constrain2:
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+.constrain:
+ %define tmp rsp+gprsize+68
+ movu m4, [tmp+r5+64*0]
+ vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
+ movu m5, [tmp+r5+64*2]
+ vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
+ neg r5
+ movu m6, [tmp+r5+64*0]
+ vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
+ movu m7, [tmp+r5+64*2]
+ vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
+ CONSTRAIN m8, m4, m0, m12, m13, m14, m15
+ CONSTRAIN m9, m5, m1, m12, m13, m14, m15
+ CONSTRAIN m10, m6, m0, m12, m13, m14, m15
+ CONSTRAIN m11, m7, m1, m12, m13, m14, m15
+ paddw m8, m10
+ paddw m9, m11
+ ret
+
+%endif ; ARCH_X86_64
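
Across the 16-bit CDEF files, the per-pixel arithmetic is the same: a tap-weighted sum of constrained differences, rounded and added to the source pixel, and on the primary+secondary paths clipped to the minimum and maximum of the pixel and its sampled taps (the running pminuw/pmaxsw registers; mixing an unsigned minimum with a signed maximum presumably lets the -16384 padding value drop out of both bounds). The SSE/AVX2 code and the AVX-512 8x8 path round with the psraw-15 / pmulhrsw-by-pw_2048 pair, while the AVX-512 4x4/4x8 paths fold an equivalent bias into a dword accumulator (pd_268435568). A scalar sketch of the final rounding and clipping step, under those assumptions:

/* What the "psraw 15; paddw; pmulhrsw pw_2048" tail computes per lane, plus
 * the clip to the [min, max] range tracked across px and all sampled taps.
 * pmulhrsw(x, 2048) is (x + 8) >> 4; pre-adding sum >> 15 (i.e. -1 for
 * negative sums) reproduces the reference rounding (8 + sum - (sum < 0)) >> 4. */
static int cdef_round_clip(const int px, const int sum,
                           const int clip_min, const int clip_max)
{
    int out = px + ((8 + sum - (sum < 0)) >> 4);
    if (out < clip_min) out = clip_min;   /* pmaxsw with the running minimum */
    if (out > clip_max) out = clip_max;   /* pminsw with the running maximum */
    return out;
}

The pri-only and sec-only paths apply the same rounding without the clamp.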
diff --git a/third_party/dav1d/src/x86/cdef16_sse.asm b/third_party/dav1d/src/x86/cdef16_sse.asm
new file mode 100644
index 0000000000..1bd67ace64
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_sse.asm
@@ -0,0 +1,1033 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright (c) 2017-2021, The rav1e contributors
+; Copyright (c) 2021, Nathan Egge
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+%macro DUP8 1-*
+ %rep %0
+ times 8 dw %1
+ %rotate 1
+ %endrep
+%endmacro
+
+pri_taps: DUP8 4, 2, 3, 3
+dir_table: db 1 * 32 + 0, 2 * 32 + 0
+ db 1 * 32 + 0, 2 * 32 - 2
+ db -1 * 32 + 2, -2 * 32 + 4
+ db 0 * 32 + 2, -1 * 32 + 4
+ db 0 * 32 + 2, 0 * 32 + 4
+ db 0 * 32 + 2, 1 * 32 + 4
+ db 1 * 32 + 2, 2 * 32 + 4
+ db 1 * 32 + 0, 2 * 32 + 2
+ db 1 * 32 + 0, 2 * 32 + 0
+ db 1 * 32 + 0, 2 * 32 - 2
+ db -1 * 32 + 2, -2 * 32 + 4
+ db 0 * 32 + 2, -1 * 32 + 4
+
+dir_shift: times 4 dw 0x4000
+ times 4 dw 0x1000
+
+pw_128: times 4 dw 128
+pw_2048: times 8 dw 2048
+pw_m16384: times 8 dw -16384
+
+cextern cdef_dir_8bpc_ssse3.main
+cextern cdef_dir_8bpc_sse4.main
+cextern shufw_6543210x
+
+SECTION .text
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 5, 3
+%elif WIN64
+DECLARE_REG_TMP 8, 4
+%else
+DECLARE_REG_TMP 8, 6
+%endif
+
+%macro CDEF_FILTER 2 ; w, h
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir
+ mova m8, [base+pw_2048]
+%else
+ DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
+ %define m8 [base+pw_2048]
+ %define m9 [rsp+16*1+gprsize]
+ %define m10 [rsp+16*2+gprsize]
+%endif
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ test prid, prid
+ jz .sec_only
+ movd m6, r5m
+%if ARCH_X86_32
+ mov [rsp+24], pridmpd
+%endif
+ bsr pridmpd, prid
+ lea tmpd, [priq*4]
+ cmp dword r10m, 0x3ff ; if (bpc == 10)
+ cmove prid, tmpd ; pri <<= 2
+ mov tmpd, r8m ; damping
+ mov dird, r7m
+ and prid, 16
+ pshufb m6, m7 ; splat
+ lea dirq, [base+dir_table+dirq*2]
+ lea priq, [base+pri_taps+priq*2]
+ test secd, secd
+ jz .pri_only
+ mova [rsp], m6
+ movd m6, secd
+ tzcnt secd, secd
+ sub pridmpd, tmpd
+ sub tmpd, secd
+ pshufb m6, m7
+ xor secd, secd
+ neg pridmpd
+ cmovs pridmpd, secd
+%if ARCH_X86_32
+ mov [pri_shift+4], secd
+ mov [sec_shift+4], secd
+%endif
+ mov [pri_shift+0], pridmpq
+ mov [sec_shift+0], tmpq
+ lea tmpq, [px]
+%if WIN64
+ movaps r4m, m9
+ movaps r6m, m10
+%elif ARCH_X86_32
+ mov pridmpd, [rsp+24]
+%endif
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
+%endrep
+%if WIN64
+ movaps m9, r4m
+ movaps m10, r6m
+%endif
+ jmp .end
+.pri_only:
+ sub tmpd, pridmpd
+ cmovs tmpd, secd
+%if ARCH_X86_32
+ mov pridmpd, [rsp+24]
+ mov [pri_shift+4], secd
+%endif
+ mov [pri_shift+0], tmpq
+ lea tmpq, [px]
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
+%endrep
+.end:
+ RET
+.sec_only:
+ mov tmpd, r8m ; damping
+ movd m6, r6m
+ tzcnt secd, secd
+ mov dird, r7m
+ pshufb m6, m7
+ sub tmpd, secd
+ lea dirq, [base+dir_table+dirq*2]
+%if ARCH_X86_32
+ mov [sec_shift+4], prid
+%endif
+ mov [sec_shift+0], tmpq
+ lea tmpq, [px]
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
+%endrep
+ jmp .end
+%if %1 == %2
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir
+ %else
+ DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
+ %endif
+ALIGN function_align
+.pri:
+ movsx offq, byte [dirq+4] ; off_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0p0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0p1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+5] ; off_k1
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m0, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1p0
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1p1
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4 ; adiff_k1p0
+ paddw m0, m2 ; constrain(diff_k0)
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m0, [priq+16*0] ; pri_tap_k0
+ pmullw m7, [priq+16*1] ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ paddw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+ALIGN function_align
+.sec:
+ movsx offq, byte [dirq+8] ; off1_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+0] ; off2_k0
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k0s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k0s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+9] ; off1_k1
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m4 ; adiff_k0s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k1s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k1s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+1] ; off2_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ paddw m0, m7
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k1s3)
+ paddw m0, m4 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ paddw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+ALIGN function_align
+.pri_sec:
+ movsx offq, byte [dirq+8] ; off2_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+0] ; off3_k0
+ pabsw m4, m2
+%if ARCH_X86_64
+ pabsw m10, m3
+ pmaxsw m9, m2, m3
+ pminsw m10, m4
+%else
+ pabsw m7, m3
+ pmaxsw m5, m2, m3
+ pminsw m4, m7
+ mova m9, m5
+ mova m10, m4
+%endif
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k0s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k0s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+9] ; off2_k1
+ pabsw m7, m4
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m5
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m7
+ pmaxsw m9, m5
+ pminsw m10, m3
+%else
+ pminsw m7, m10
+ pminsw m7, m3
+ pmaxsw m3, m9, m4
+ pmaxsw m3, m5
+ mova m10, m7
+ mova m9, m3
+%endif
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ paddw m0, m2
+ pabsw m3, m4 ; adiff_k0s2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k1s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k1s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+1] ; off3_k1
+ paddw m0, m7
+ pabsw m7, m2
+ psignw m4, m5 ; constrain(diff_k0s3)
+ pabsw m5, m3
+%if ARCH_X86_64
+ pmaxsw m9, m2
+ pminsw m10, m7
+ pmaxsw m9, m3
+ pminsw m10, m5
+%else
+ pminsw m7, m10
+ pminsw m7, m5
+ pmaxsw m5, m9, m2
+ pmaxsw m5, m3
+ mova m10, m7
+ mova m9, m5
+%endif
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+4] ; off1_k0
+ paddw m0, m7
+ pabsw m7, m4
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m5
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m7
+ pmaxsw m9, m5
+ pminsw m10, m3
+%else
+ pminsw m7, m10
+ pminsw m7, m3
+ pmaxsw m3, m9, m4
+ pmaxsw m3, m5
+ mova m10, m7
+ mova m9, m3
+%endif
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k0p0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0p1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+5] ; off1_k1
+ pabsw m7, m2
+ psignw m4, m5 ; constrain(diff_k1s3)
+ pabsw m5, m3
+%if ARCH_X86_64
+ pmaxsw m9, m2
+ pminsw m10, m7
+ pmaxsw m9, m3
+ pminsw m10, m5
+%else
+ pminsw m7, m10
+ pminsw m7, m5
+ pmaxsw m5, m9, m2
+ pmaxsw m5, m3
+ mova m10, m7
+ mova m9, m5
+%endif
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ paddw m0, m4
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m7, [rsp+gprsize], m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m7, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k0p0)
+ psubusw m2, [rsp+gprsize], m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1p0
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1p1
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4
+ paddw m7, m2 ; constrain(diff_k0)
+ pabsw m2, m5
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m3
+ pmaxsw m9, m5
+ pminsw m10, m2
+%else
+ pminsw m3, m10
+ pminsw m3, m2
+ pmaxsw m2, m9, m4
+ pmaxsw m2, m5
+ mova m10, m3
+ mova m9, m2
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ pabsw m3, m4 ; adiff_k1p0
+ pmullw m7, [priq+16*0] ; pri_tap_k0
+ paddw m0, m7
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, [rsp+16*0+gprsize], m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, [rsp+16*0+gprsize], m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m7, [priq+16*1] ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ paddw m0, m1
+%if ARCH_X86_64
+ pmaxsw m9, m1
+ pminsw m0, m9
+%else
+ pmaxsw m2, m9, m1
+ pminsw m0, m2
+%endif
+ pminsw m1, m10
+ pmaxsw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+%endif
+%endmacro
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \
+ pri, sec, edge
+ %define px rsp+32*4
+%else
+cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
+ %define botq topq
+ %define px rsp+32*5
+%endif
+ %define base t0-dir_table
+ %define pri_shift px-16*6
+ %define sec_shift px-16*5
+ mov edged, r9m
+ LEA t0, dir_table
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea t1, [dstq+strideq*2]
+ movu m2, [t1 +strideq*0]
+ movu m3, [t1 +strideq*1]
+ movddup m7, [base+pw_m16384]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
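+ ; note: pw_m16384 (0xc000) is used below to mark pixels outside the frame.
+ ; It is negative as a signed word, so it never wins the signed max, and its
+ ; absolute value (16384) is above the 12-bit pixel range, so it never wins
+ ; the min of absolute values tracked for clipping in .pri_sec.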
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], m7
+ movd [px-32*1-4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movifnidn botq, r4mp
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*4+0], m0
+ mova [px+32*5+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [botq+strideq*0-4]
+ movd m1, [botq+strideq*1-4]
+ movd [px+32*4-4], m0
+ movd [px+32*5-4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*4+0], m7
+ mova [px+32*5+0], m7
+.bottom_no_left:
+ movd [px+32*4-4], m7
+ movd [px+32*5-4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0]
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0-4], m0
+ movd [px+32*1-4], m1
+ movd [px+32*2-4], m2
+ movd [px+32*3-4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
+.padding_done:
+ CDEF_FILTER 4, 4
+
+%if ARCH_X86_64
+cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+%else
+cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+ mov edged, r9m
+ LEA t0, dir_table
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea t1, [dstq+strideq*2]
+ movu m2, [t1 +strideq*0]
+ movu m3, [t1 +strideq*1]
+ lea t1, [t1 +strideq*2]
+ movu m4, [t1 +strideq*0]
+ movu m5, [t1 +strideq*1]
+ lea t1, [t1 +strideq*2]
+ movu m6, [t1 +strideq*0]
+ movu m7, [t1 +strideq*1]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ mova [px+32*4+0], m4
+ mova [px+32*5+0], m5
+ mova [px+32*6+0], m6
+ mova [px+32*7+0], m7
+ movddup m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], m7
+ movd [px-32*1-4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movifnidn botq, r4mp
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*8+0], m0
+ mova [px+32*9+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [botq+strideq*0-4]
+ movd m1, [botq+strideq*1-4]
+ movd [px+32*8-4], m0
+ movd [px+32*9-4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+0], m7
+ mova [px+32*9+0], m7
+.bottom_no_left:
+ movd [px+32*8-4], m7
+ movd [px+32*9-4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0]
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0-4], m0
+ movd [px+32*1-4], m1
+ movd [px+32*2-4], m2
+ movd [px+32*3-4], m3
+ movd m0, [leftq+4*4]
+ movd m1, [leftq+4*5]
+ movd m2, [leftq+4*6]
+ movd m3, [leftq+4*7]
+ movd [px+32*4-4], m0
+ movd [px+32*5-4], m1
+ movd [px+32*6-4], m2
+ movd [px+32*7-4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 4, 8
+
+%if ARCH_X86_64
+cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+%else
+cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+ mov edged, r9m
+ LEA t0, dir_table
+ mova m0, [dstq+strideq*0+ 0]
+ movd m1, [dstq+strideq*0+16]
+ mova m2, [dstq+strideq*1+ 0]
+ movd m3, [dstq+strideq*1+16]
+ lea t1, [dstq+strideq*2]
+ mova m4, [t1 +strideq*0+ 0]
+ movd m5, [t1 +strideq*0+16]
+ mova m6, [t1 +strideq*1+ 0]
+ movd m7, [t1 +strideq*1+16]
+ lea t1, [t1 +strideq*2]
+ mova [px+32*0+ 0], m0
+ movd [px+32*0+16], m1
+ mova [px+32*1+ 0], m2
+ movd [px+32*1+16], m3
+ mova [px+32*2+ 0], m4
+ movd [px+32*2+16], m5
+ mova [px+32*3+ 0], m6
+ movd [px+32*3+16], m7
+ mova m0, [t1 +strideq*0+ 0]
+ movd m1, [t1 +strideq*0+16]
+ mova m2, [t1 +strideq*1+ 0]
+ movd m3, [t1 +strideq*1+16]
+ lea t1, [t1 +strideq*2]
+ mova m4, [t1 +strideq*0+ 0]
+ movd m5, [t1 +strideq*0+16]
+ mova m6, [t1 +strideq*1+ 0]
+ movd m7, [t1 +strideq*1+16]
+ mova [px+32*4+ 0], m0
+ movd [px+32*4+16], m1
+ mova [px+32*5+ 0], m2
+ movd [px+32*5+16], m3
+ mova [px+32*6+ 0], m4
+ movd [px+32*6+16], m5
+ mova [px+32*7+ 0], m6
+ movd [px+32*7+16], m7
+ movddup m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ mova m0, [topq+strideq*0+ 0]
+ mova m1, [topq+strideq*0+16]
+ mova m2, [topq+strideq*1+ 0]
+ mova m3, [topq+strideq*1+16]
+ mova [px-32*2+ 0], m0
+ movd [px-32*2+16], m1
+ mova [px-32*1+ 0], m2
+ movd [px-32*1+16], m3
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+ 0], m7
+ movd [px-32*2+16], m7
+ mova [px-32*1+ 0], m7
+ movd [px-32*1+16], m7
+.top_no_left:
+ movd [px-32*2- 4], m7
+ movd [px-32*1- 4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movifnidn botq, r4mp
+ mova m0, [botq+strideq*0+ 0]
+ movd m1, [botq+strideq*0+16]
+ mova m2, [botq+strideq*1+ 0]
+ movd m3, [botq+strideq*1+16]
+ mova [px+32*8+ 0], m0
+ movd [px+32*8+16], m1
+ mova [px+32*9+ 0], m2
+ movd [px+32*9+16], m3
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [botq+strideq*0-4]
+ movd m1, [botq+strideq*1-4]
+ movd [px+32*8- 4], m0
+ movd [px+32*9- 4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+ 0], m7
+ movd [px+32*8+16], m7
+ mova [px+32*9+ 0], m7
+ movd [px+32*9+16], m7
+.bottom_no_left:
+ movd [px+32*8- 4], m7
+ movd [px+32*9- 4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0]
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0- 4], m0
+ movd [px+32*1- 4], m1
+ movd [px+32*2- 4], m2
+ movd [px+32*3- 4], m3
+ movd m0, [leftq+4*4]
+ movd m1, [leftq+4*5]
+ movd m2, [leftq+4*6]
+ movd m3, [leftq+4*7]
+ movd [px+32*4- 4], m0
+ movd [px+32*5- 4], m1
+ movd [px+32*6- 4], m2
+ movd [px+32*7- 4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 8, 8
+
+%macro CDEF_DIR 0
+%if ARCH_X86_64
+cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
+ lea r6, [dir_shift]
+ shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
+ movddup m7, [r6+bdmaxq*8]
+ lea r6, [strideq*3]
+ mova m0, [srcq+strideq*0]
+ mova m1, [srcq+strideq*1]
+ mova m2, [srcq+strideq*2]
+ mova m3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ mova m4, [srcq+strideq*0]
+ mova m5, [srcq+strideq*1]
+ mova m6, [srcq+strideq*2]
+ REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhuw m7, [srcq+r6 ]
+ pxor m8, m8
+ packuswb m9, m0, m1
+ packuswb m10, m2, m3
+ packuswb m11, m4, m5
+ packuswb m12, m6, m7
+ REPX {psadbw x, m8}, m9, m10, m11, m12
+ packssdw m9, m10
+ packssdw m11, m12
+ packssdw m9, m11
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
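+ ; the pmulhuw above scales the 10/12-bit samples by a per-bitdepth constant
+ ; from dir_shift, bringing them into (roughly) 8-bit range; from that point
+ ; the direction search is bitdepth-independent, so both paths can tail-call
+ ; the shared 8bpc .main.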
+%else
+cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax
+ mov bdmaxd, bdmaxm
+ LEA r2, dir_shift
+ shr bdmaxd, 11
+ movddup m7, [r2+bdmaxq*8]
+ lea r3, [strideq*3]
+ pmulhuw m3, m7, [srcq+strideq*0]
+ pmulhuw m4, m7, [srcq+strideq*1]
+ pmulhuw m5, m7, [srcq+strideq*2]
+ pmulhuw m6, m7, [srcq+r3 ]
+ movddup m1, [r2-dir_shift+pw_128]
+ lea srcq, [srcq+strideq*4]
+ pxor m0, m0
+ packuswb m2, m3, m4
+ psubw m3, m1
+ psubw m4, m1
+ mova [esp+0x00], m3
+ mova [esp+0x10], m4
+ packuswb m3, m5, m6
+ psadbw m2, m0
+ psadbw m3, m0
+ psubw m5, m1
+ psubw m6, m1
+ packssdw m2, m3
+ mova [esp+0x20], m5
+ mova [esp+0x50], m6
+ pmulhuw m4, m7, [srcq+strideq*0]
+ pmulhuw m5, m7, [srcq+strideq*1]
+ pmulhuw m6, m7, [srcq+strideq*2]
+ pmulhuw m7, [srcq+r3 ]
+ packuswb m3, m4, m5
+ packuswb m1, m6, m7
+ psadbw m3, m0
+ psadbw m1, m0
+ packssdw m3, m1
+ movddup m1, [r2-dir_shift+pw_128]
+ LEA r2, shufw_6543210x
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
+%endif
+%endmacro
+
+INIT_XMM ssse3
+CDEF_DIR
+
+INIT_XMM sse4
+CDEF_DIR
diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm
new file mode 100644
index 0000000000..1f30f8a3b7
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_avx2.asm
@@ -0,0 +1,1772 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_jmptable %%table
+ %xdefine %%base mangle(private_prefix %+ _%1_avx2)
+ %%table:
+ %rep %0 - 1
+ dd %%base %+ .%2 - %%table
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro CDEF_FILTER_JMP_TABLE 1
+JMP_TABLE cdef_filter_%1_8bpc, \
+ d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
+ d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1
+%endmacro
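+; The table is padded with 4 entries of d6/d7 at the start and d0/d1 at the
+; end so that the dir+2 and dir-2 secondary taps can be looked up with a
+; simple offset from the primary direction, without masking dir to 0-7.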
+
+SECTION_RODATA 32
+
+pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
+blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
+ dd 0x80, 0x00, 0x00
+blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ dd 0x00, 0x00
+blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000
+blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000, 0x0000
+blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
+blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
+div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
+shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pw_128: times 2 dw 128
+pw_2048: times 2 dw 2048
+tap_table: ; masks for 8 bit shifts
+ db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
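+ ; (there is no per-byte shift instruction, so the filter does a 16-bit
+ ; psrlw and then ands the result with one of these masks, 0xff >> shift,
+ ; to clear the bits that leak in from the neighbouring byte)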
+ ; weights
+ db 4, 2, 3, 3, 2, 1
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+ db 1 * 16 + 0, 2 * 16 + 0
+ db 1 * 16 + 0, 2 * 16 - 1
+ ; the last 6 are repeats of the first 6 so we don't need to & 7
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+
+CDEF_FILTER_JMP_TABLE 4x4
+CDEF_FILTER_JMP_TABLE 4x8
+CDEF_FILTER_JMP_TABLE 8x8
+
+SECTION .text
+
+%macro PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r7m
+ lea tableq, [cdef_filter_%1x%2_8bpc_jmptable]
+ lea dirq, [tableq+dirq*2*4]
+%if %1 == 4
+ %if %2 == 4
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
+ table, dir, dirjmp, stride3, k
+ %else
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
+ table, dir, dirjmp, dst4, stride3, k
+ lea dst4q, [dstq+strideq*4]
+ %endif
+%else
+ DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
+ table, dir, dirjmp, top2, stride3, k
+ mov hq, -8
+ lea top1q, [top1q+strideq*0]
+ lea top2q, [top1q+strideq*1]
+%endif
+%if %1 == 4
+ lea stride3q, [strideq*3]
+%endif
+%endmacro
+
+%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+ pxor m15, m15 ; sum
+%if %2 == 8
+ pxor m12, m12
+ %if %1 == 4
+ movd xm4, [dstq +strideq*0]
+ movd xm6, [dstq +strideq*1]
+ movd xm5, [dstq +strideq*2]
+ movd xm7, [dstq +stride3q ]
+ vinserti128 m4, [dst4q+strideq*0], 1
+ vinserti128 m6, [dst4q+strideq*1], 1
+ vinserti128 m5, [dst4q+strideq*2], 1
+ vinserti128 m7, [dst4q+stride3q ], 1
+ punpckldq m4, m6
+ punpckldq m5, m7
+ %else
+ movq xm4, [dstq+strideq*0]
+ movq xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ %endif
+ punpcklqdq m4, m5
+%else
+ movd xm4, [dstq+strideq*0]
+ movd xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ punpckldq m4, m5
+%endif
+%if %3 == 1
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+%endif
+%endmacro
+
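+; Both ACCUMULATE_TAP_* macros below add, for each tap k and sampled pixel p:
+;   sum += tap[k] * constrain(p - px, strength, shift)
+; with constrain(d, s, sh) = sign(d) * min(|d|, max(0, s - (|d| >> sh))),
+; the max(0, .) being provided by the unsigned saturating subtract.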
+%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, h, clip
+ ; load p0/p1
+ movsxd dirjmpq, [dirq+kq*4+%1*2*4]
+ add dirjmpq, tableq
+ call dirjmpq
+
+%if %8 == 1
+ pmaxub m7, m5
+ pminub m8, m5
+ pmaxub m7, m6
+ pminub m8, m6
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+%if %7 == 4
+ punpcklbw m5, m6
+ punpcklbw m6, m4, m4
+ psubusb m9, m5, m6
+ psubusb m5, m6, m5
+ por m9, m5 ; abs_diff_p01(p01 - px)
+ pcmpeqb m5, m9
+ por m5, %5
+ psignb m6, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m6
+ paddw m15, m5
+%else
+ psubusb m9, m5, m4
+ psubusb m5, m4, m5
+ psubusb m11, m6, m4
+ psubusb m6, m4, m6
+ por m9, m5 ; abs_diff_p0(p0 - px)
+ por m11, m6 ; abs_diff_p1(p1 - px)
+ pcmpeqb m5, m9
+ pcmpeqb m6, m11
+ punpckhbw m10, m9, m11
+ punpcklbw m9, m11
+ por m5, %5
+ por m11, m6, %5
+ punpckhbw m6, m5, m11
+ punpcklbw m5, m11
+ psignb m11, %5, m6
+ psrlw m6, m10, %2 ; emulate 8-bit shift
+ pand m6, %3
+ psubusb m6, %4, m6
+ pminub m6, m10
+ pmaddubsw m6, m11
+ paddw m12, m6
+ psignb m11, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m11
+ paddw m15, m5
+%endif
+%endmacro
+
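+; ADJUST_PIXEL folds the accumulated sum back into the pixels:
+;   dst = px + ((sum + 8 - (sum < 0)) >> 4)
+; pcmpgtw/paddw subtract 1 from negative sums, and pmulhrsw with pw_2048
+; computes (x + 8) >> 4 (2048/32768 = 1/16 with rounding); the clip variant
+; then clamps the result to the min/max of the sampled pixels (m8/m7).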
+%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
+%if %2 == 4
+ %if %5 == 1
+ punpcklbw m4, %3
+ %endif
+ pcmpgtw %3, m15
+ paddw m15, %3
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m15
+ paddb m4, m15
+ %else
+ paddw m4, m15
+ packuswb m4, m4 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+strideq*1], xm4, 1
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ pcmpgtw m6, %3, m12
+ pcmpgtw m5, %3, m15
+ paddw m12, m6
+ paddw m15, m5
+ %if %5 == 1
+ punpckhbw m5, m4, %3
+ punpcklbw m4, %3
+ %endif
+ pmulhrsw m12, %4
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m12
+ paddb m4, m15
+ %else
+ paddw m5, m12
+ paddw m4, m15
+ packuswb m4, m5 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ %if %1 == 4
+ movd [dstq +strideq*0], xm4
+ movd [dst4q+strideq*0], xm5
+ pextrd [dstq +strideq*1], xm4, 1
+ pextrd [dst4q+strideq*1], xm5, 1
+ pextrd [dstq +strideq*2], xm4, 2
+ pextrd [dst4q+strideq*2], xm5, 2
+ pextrd [dstq +stride3q ], xm4, 3
+ pextrd [dst4q+stride3q ], xm5, 3
+ %else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm5
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+stride3q ], xm5
+ %endif
+%endif
+%endmacro
+
+%macro BORDER_PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r7m
+ lea dirq, [tableq+dirq*2+14]
+%if %1*%2*2/mmsize > 1
+ %if %1 == 4
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
+ %else
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
+ %endif
+ mov hd, %1*%2*2/mmsize
+%else
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
+%endif
+ lea stkq, [px]
+ pxor m11, m11
+%endmacro
+
+%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+%if %1 == 4
+ movq xm4, [stkq+32*0]
+ movhps xm4, [stkq+32*1]
+ movq xm5, [stkq+32*2]
+ movhps xm5, [stkq+32*3]
+ vinserti128 m4, xm5, 1
+%else
+ mova xm4, [stkq+32*0] ; px
+ vinserti128 m4, [stkq+32*1], 1
+%endif
+ pxor m15, m15 ; sum
+%if %3 == 1
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, clip
+ ; load p0/p1
+ movsx offq, byte [dirq+kq+%1] ; off1
+%if %6 == 4
+ movq xm5, [stkq+offq*2+32*0] ; p0
+ movq xm6, [stkq+offq*2+32*2]
+ movhps xm5, [stkq+offq*2+32*1]
+ movhps xm6, [stkq+offq*2+32*3]
+ vinserti128 m5, xm6, 1
+%else
+ movu xm5, [stkq+offq*2+32*0] ; p0
+ vinserti128 m5, [stkq+offq*2+32*1], 1
+%endif
+ neg offq ; -off1
+%if %6 == 4
+ movq xm6, [stkq+offq*2+32*0] ; p1
+ movq xm9, [stkq+offq*2+32*2]
+ movhps xm6, [stkq+offq*2+32*1]
+ movhps xm9, [stkq+offq*2+32*3]
+ vinserti128 m6, xm9, 1
+%else
+ movu xm6, [stkq+offq*2+32*0] ; p1
+ vinserti128 m6, [stkq+offq*2+32*1], 1
+%endif
+%if %7 == 1
+ ; out of bounds values are set to a value that is both a large unsigned
+ ; value and a negative signed value.
+ ; use signed max and unsigned min to remove them
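+ ; (the border path stores 0x8000 for such pixels: as a signed word that is
+ ; -32768, so it never survives pmaxsw, and as an unsigned word it is 32768,
+ ; larger than any real pixel, so it never survives pminuw)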
+ pmaxsw m7, m5 ; max after p0
+ pminuw m8, m5 ; min after p0
+ pmaxsw m7, m6 ; max after p1
+ pminuw m8, m6 ; min after p1
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+ ; calculate difference before converting
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+
+ ; convert to 8 bits with signed saturation
+ ; saturating to large diffs has no impact on the results
+ packsswb m5, m6
+
+ ; group into pairs so we can accumulate using maddubsw
+ pshufb m5, m12
+ pabsb m9, m5
+ psignb m10, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+
+ ; use unsigned min since abs diff can equal 0x80
+ pminub m5, m9
+ pmaddubsw m5, m10
+ paddw m15, m5
+%endmacro
+
+%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
+ pcmpgtw m9, m11, m15
+ paddw m15, m9
+ pmulhrsw m15, %2
+ paddw m4, m15
+%if %3 == 1
+ pminsw m4, m7
+ pmaxsw m4, m8
+%endif
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+%if %1 == 4
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+%endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+INIT_YMM avx2
+cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%assign stack_offset_entry stack_offset
+ mov edged, edgem
+ cmp edged, 0xf
+ jne .border_block
+
+ PUSH r11
+ PUSH r12
+%if %2 == 4
+%assign regs_used 13
+ ALLOC_STACK 0x60, 16
+ pmovzxbw xm0, [leftq+1]
+ vpermq m0, m0, q0110
+ psrldq m1, m0, 4
+ vpalignr m2, m0, m0, 12
+ movu [rsp+0x10], m0
+ movu [rsp+0x28], m1
+ movu [rsp+0x40], m2
+%elif %1 == 4
+%assign regs_used 14
+ PUSH r13
+ ALLOC_STACK 8*2+%1*%2*1, 16
+ pmovzxwd m0, [leftq]
+ mova [rsp+0x10], m0
+%else
+%assign regs_used 15
+ PUSH r13
+ PUSH r14
+ ALLOC_STACK 8*4+%1*%2*2+32, 16
+ lea r11, [strideq*3]
+ movu xm4, [dstq+strideq*2]
+ pmovzxwq m0, [leftq+0]
+ pmovzxwq m1, [leftq+8]
+ vinserti128 m4, [dstq+r11], 1
+ pmovzxbd m2, [leftq+1]
+ pmovzxbd m3, [leftq+9]
+ mov [rsp+16], botq
+ mova [rsp+0x20], m0
+ mova [rsp+0x40], m1
+ mova [rsp+0x60], m2
+ mova [rsp+0x80], m3
+ mova [rsp+0xa0], m4
+ lea botq, [dstq+strideq*4]
+%endif
+
+ DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
+ mov dampingd, r8m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ test prid, prid
+ jz .sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ test secdmpd, secdmpd
+ jz .pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
+
+ PREP_REGS %1, %2
+%if %1*%2 > mmsize
+.v_loop:
+%endif
+ LOAD_BLOCK %1, %2, 1
+.k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
+ dec kq
+ jge .k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ pxor m9, m9
+ ADJUST_PIXEL %1, %2, m9, m10, 1
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .v_loop
+%endif
+ RET
+
+.pri_only:
+ DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m3, [pw_2048]
+ pxor m1, m1
+%if %1*%2 > mmsize
+.pri_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
+ dec kq
+ jge .pri_k_loop
+ ADJUST_PIXEL %1, %2, m1, m3
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .pri_v_loop
+%endif
+ RET
+
+.sec_only:
+ DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m2, [pw_2048]
+ pxor m0, m0
+%if %1*%2 > mmsize
+.sec_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
+ dec kq
+ jge .sec_k_loop
+ ADJUST_PIXEL %1, %2, m0, m2
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .sec_v_loop
+%endif
+ RET
+
+.d0k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-1]
+ vpbroadcastq m10, [dstq+strideq*2-1]
+ movd xm5, [topq+strideq*1+1]
+ movd xm9, [dstq+strideq*0+1]
+ psrldq m11, m6, 2
+ psrldq m12, m10, 2
+ vinserti128 m6, [dstq+stride3q -1], 1
+ vinserti128 m10, [botq -1], 1
+ vpblendd m5, m11, 0x10
+ vpblendd m9, m12, 0x10
+ movu m11, [blend_4x4+16]
+ punpckldq m6, m10
+ punpckldq m5, m9
+ vpblendvb m6, [rsp+gprsize+0x28], m11
+ %else
+ movd xm5, [topq +strideq*1+1]
+ movq xm6, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm5, [dstq +strideq*0+1], 1
+ movhps xm6, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm9, xm6, 2
+ shufps xm5, xm9, q2010 ; -1 +0 +1 +2
+ shufps xm6, xm10, q2020 ; +1 +2 +3 +4
+ psrldq xm9, xm11, 2
+ psrldq xm10, 2
+ shufps xm10, xm9, q2020 ; +3 +4 +5 +6
+ movd xm9, [dst4q+stride3q -1]
+ pinsrd xm9, [botq -1], 1
+ shufps xm11, xm9, q1020 ; +5 +6 +7 +8
+ pmovzxbw m9, [leftq+3]
+ vinserti128 m6, xm11, 1
+ movu m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [top2q +1]
+ vbroadcasti128 m10, [dstq+strideq*1-1]
+ vbroadcasti128 m11, [dstq+strideq*2-1]
+ movhps xm5, [dstq+strideq*0+1]
+ vinserti128 m6, m10, [dstq+stride3q-1], 1
+ vinserti128 m9, m11, [botq -1], 1
+ psrldq m10, 2
+ psrldq m11, 2
+ punpcklqdq m6, m9
+ movu m9, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m11
+ vpblendd m5, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
+%endif
+ ret
+.d1k0:
+.d2k0:
+.d3k0:
+%if %1 == 4
+ %if %2 == 4
+ movq xm6, [dstq+strideq*0-1]
+ movq xm9, [dstq+strideq*1-1]
+ vinserti128 m6, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m11, [rsp+gprsize+0x10]
+ pcmpeqd m12, m12
+ psrldq m5, m6, 2
+ psrldq m10, m9, 2
+ psrld m12, 24
+ punpckldq m6, m9
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movq xm6, [dstq +strideq*0-1]
+ movq xm9, [dstq +strideq*2-1]
+ movhps xm6, [dstq +strideq*1-1]
+ movhps xm9, [dstq +stride3q -1]
+ movq xm10, [dst4q+strideq*0-1]
+ movhps xm10, [dst4q+strideq*1-1]
+ psrldq xm5, xm6, 2
+ psrldq xm11, xm9, 2
+ shufps xm5, xm11, q2020
+ movq xm11, [dst4q+strideq*2-1]
+ movhps xm11, [dst4q+stride3q -1]
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, xm11, q2020
+ vinserti128 m6, xm9, 1
+ pmovzxbw m9, [leftq+1]
+ psrldq xm10, 2
+ psrldq xm11, 2
+ shufps xm10, xm11, q2020
+ vpbroadcastd m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ movu xm5, [dstq+strideq*0-1]
+ movu xm9, [dstq+strideq*1-1]
+ vinserti128 m5, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m10, [blend_8x8_0+16]
+ punpcklqdq m6, m5, m9
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64], m10
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d4k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m10, [dstq+strideq*1-1]
+ vpbroadcastq m11, [dstq+strideq*2-1]
+ movd xm6, [topq+strideq*1-1]
+ movd xm9, [dstq+strideq*0-1]
+ psrldq m5, m10, 2
+ psrldq m12, m11, 2
+ vpblendd m6, m10, 0x10
+ vpblendd m9, m11, 0x10
+ movu m10, [blend_4x4]
+ vinserti128 m5, [dstq+stride3q +1], 1
+ vinserti128 m12, [botq +1], 1
+ punpckldq m6, m9
+ punpckldq m5, m12
+ vpblendvb m6, [rsp+gprsize+0x40], m10
+ %else
+ movd xm6, [topq +strideq*1-1]
+ movq xm9, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm6, [dstq +strideq*0-1], 1
+ movhps xm9, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm5, xm9, 2
+ shufps xm6, xm9, q2010
+ psrldq xm9, xm10, 2
+ shufps xm5, xm9, q2020
+ shufps xm10, xm11, q2020
+ movd xm9, [dst4q+stride3q +1]
+ vinserti128 m6, xm10, 1
+ pinsrd xm9, [botq +1], 1
+ psrldq xm11, 2
+ pmovzxbw m10, [leftq-1]
+ shufps xm11, xm9, q1020
+ movu m9, [blend_4x8_0]
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, m10, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0+8]
+ movq xm6, [top2q -1]
+ vbroadcasti128 m5, [dstq+strideq*1-1]
+ vbroadcasti128 m9, [dstq+strideq*2-1]
+ movhps xm6, [dstq+strideq*0-1]
+ movu m11, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m5, m9
+ vinserti128 m5, [dstq+stride3q -1], 1
+ vinserti128 m9, [botq -1], 1
+ vpblendd m6, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d5k0:
+.d6k0:
+.d7k0:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*1 ]
+ vpbroadcastd m5, [dstq+strideq*1 ]
+ vpbroadcastd m9, [dstq+strideq*2 ]
+ vpblendd xm6, [dstq+strideq*0-4], 0x2
+ vpblendd m5, m9, 0x22
+ vpblendd m6, m5, 0x30
+ vinserti128 m5, [dstq+stride3q ], 1
+ vpblendd m5, [botq -20], 0x20
+ %else
+ movd xm6, [topq +strideq*1]
+ movd xm5, [dstq +strideq*1]
+ movd xm9, [dstq +stride3q ]
+ movd xm10, [dst4q+strideq*1]
+ movd xm11, [dst4q+stride3q ]
+ pinsrd xm6, [dstq +strideq*0], 1
+ pinsrd xm5, [dstq +strideq*2], 1
+ pinsrd xm9, [dst4q+strideq*0], 1
+ pinsrd xm10, [dst4q+strideq*2], 1
+ pinsrd xm11, [botq ], 1
+ punpcklqdq xm6, xm5
+ punpcklqdq xm5, xm9
+ punpcklqdq xm9, xm10
+ punpcklqdq xm10, xm11
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ %endif
+%else
+ movq xm6, [top2q ]
+ movq xm5, [dstq+strideq*1]
+ movq xm9, [dstq+stride3q ]
+ movhps xm6, [dstq+strideq*0]
+ movhps xm5, [dstq+strideq*2]
+ movhps xm9, [botq ]
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+%endif
+ ret
+.d0k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [dstq+strideq*2-2]
+ movd xm9, [dstq+stride3q -2]
+ movd xm5, [topq+strideq*0+2]
+ movd xm10, [topq+strideq*1+2]
+ pinsrw xm6, [leftq+4], 0
+ pinsrw xm9, [leftq+6], 0
+ vinserti128 m5, [dstq+strideq*0+2], 1
+ vinserti128 m10, [dstq+strideq*1+2], 1
+ vinserti128 m6, [botq+strideq*0-2], 1
+ vinserti128 m9, [botq+strideq*1-2], 1
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movq xm6, [dstq +strideq*2-2]
+ movd xm10, [dst4q+strideq*2-2]
+ movd xm5, [topq +strideq*0+2]
+ movq xm9, [dst4q+strideq*0-2]
+ movhps xm6, [dstq +stride3q -2]
+ pinsrw xm10, [dst4q+stride3q ], 3
+ pinsrd xm5, [topq +strideq*1+2], 1
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [botq +strideq*0-2], 2
+ pinsrd xm5, [dstq +strideq*0+2], 2
+ pinsrd xm10, [botq +strideq*1-2], 3
+ pinsrd xm5, [dstq +strideq*1+2], 3
+ shufps xm11, xm6, xm9, q3131
+ shufps xm6, xm9, q2020
+ movu m9, [blend_4x8_3+8]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, [rsp+gprsize+0x10+8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm6, [dstq+strideq*2-2]
+ movq xm9, [dstq+stride3q -2]
+ movq xm5, [top1q +2]
+ movq xm10, [top2q +2]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m6, [botq+strideq*0-2], 1
+ vinserti128 m9, [botq+strideq*1-2], 1
+ vinserti128 m5, [dstq+strideq*0+2], 1
+ vinserti128 m10, [dstq+strideq*1+2], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
+%endif
+ ret
+.d1k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-2]
+ vpbroadcastq m9, [dstq+strideq*2-2]
+ movd xm5, [topq+strideq*1+2]
+ movd xm10, [dstq+strideq*0+2]
+ psrldq m11, m6, 4
+ psrldq m12, m9, 4
+ vpblendd m5, m11, 0x10
+ movq xm11, [leftq+2]
+ vinserti128 m6, [dstq+stride3q-2], 1
+ punpckldq xm11, xm11
+ vpblendd m10, m12, 0x10
+ pcmpeqd m12, m12
+ pmovzxwd m11, xm11
+ psrld m12, 16
+ punpckldq m6, m9
+ vpbroadcastd m9, [botq-2]
+ vpblendvb m6, m11, m12
+ punpckldq m5, m10
+ vpblendd m6, m9, 0x20
+ %else
+ movd xm5, [topq +strideq*1+2]
+ movq xm6, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q -2]
+ pinsrd xm5, [dstq +strideq*0+2], 1
+ movhps xm6, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [botq -2], 1
+ shufps xm5, xm6, q3110
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, q3131
+ shufps xm10, xm11, q1020
+ movu m11, [blend_4x8_2+4]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm9, 1
+ vpblendvb m6, [rsp+gprsize+0x10+4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm5, [top2q +2]
+ vbroadcasti128 m6, [dstq+strideq*1-2]
+ vbroadcasti128 m9, [dstq+strideq*2-2]
+ movhps xm5, [dstq+strideq*0+2]
+ shufps m10, m6, m9, q2121
+ vinserti128 m6, [dstq+stride3q -2], 1
+ vinserti128 m9, [botq -2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m5, m10, 0xF0
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
+%endif
+ ret
+.d2k1:
+%if %1 == 4
+ %if %2 == 4
+ movq xm11, [leftq]
+ movq xm6, [dstq+strideq*0-2]
+ movq xm9, [dstq+strideq*1-2]
+ vinserti128 m6, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ punpckldq xm11, xm11
+ psrldq m5, m6, 4
+ psrldq m10, m9, 4
+ pmovzxwd m11, xm11
+ punpckldq m6, m9
+ punpckldq m5, m10
+ pblendw m6, m11, 0x05
+ %else
+ movq xm5, [dstq +strideq*0-2]
+ movq xm9, [dstq +strideq*2-2]
+ movq xm10, [dst4q+strideq*0-2]
+ movq xm11, [dst4q+strideq*2-2]
+ movhps xm5, [dstq +strideq*1-2]
+ movhps xm9, [dstq +stride3q -2]
+ movhps xm10, [dst4q+strideq*1-2]
+ movhps xm11, [dst4q+stride3q -2]
+ shufps xm6, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, xm11, q2020
+ shufps xm10, xm11, q3131
+ pmovzxwd m11, [leftq]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ pblendw m6, m11, 0x55
+ %endif
+%else
+ mova m11, [rsp+gprsize+0x20+hq*8+64]
+ movu xm5, [dstq+strideq*0-2]
+ movu xm9, [dstq+strideq*1-2]
+ vinserti128 m5, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ shufps m6, m5, m9, q1010
+ shufps m5, m9, q2121
+ pblendw m6, m11, 0x11
+%endif
+ ret
+.d3k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m11, [dstq+strideq*1-2]
+ vpbroadcastq m12, [dstq+strideq*2-2]
+ movd xm6, [topq+strideq*1-2]
+ movd xm9, [dstq+strideq*0-2]
+ pblendw m11, [leftq-16+2], 0x01
+ pblendw m12, [leftq-16+4], 0x01
+ pinsrw xm9, [leftq- 0+0], 0
+ psrldq m5, m11, 4
+ psrldq m10, m12, 4
+ vinserti128 m5, [dstq+stride3q +2], 1
+ vinserti128 m10, [botq +2], 1
+ vpblendd m6, m11, 0x10
+ vpblendd m9, m12, 0x10
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm6, [topq +strideq*1-2]
+ movq xm5, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q +2]
+ pinsrw xm6, [dstq +strideq*0 ], 3
+ movhps xm5, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [botq +2], 1
+ shufps xm6, xm5, q2010
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, q2020
+ shufps xm10, xm11, q1031
+ movu m11, [blend_4x8_2]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+0x10-4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+8]
+ movq xm6, [top2q -2]
+ vbroadcasti128 m5, [dstq+strideq*1-2]
+ vbroadcasti128 m10, [dstq+strideq*2-2]
+ movhps xm6, [dstq+strideq*0-2]
+ punpcklqdq m9, m5, m10
+ vinserti128 m5, [dstq+stride3q -2], 1
+ vinserti128 m10, [botq -2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m6, m9, 0xF0
+ shufps m5, m10, q2121
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
+%endif
+ ret
+.d4k1:
+%if %1 == 4
+ %if %2 == 4
+ vinserti128 m6, [dstq+strideq*0-2], 1
+ vinserti128 m9, [dstq+strideq*1-2], 1
+ movd xm5, [dstq+strideq*2+2]
+ movd xm10, [dstq+stride3q +2]
+ pblendw m6, [leftq-16+0], 0x01
+ pblendw m9, [leftq-16+2], 0x01
+ vinserti128 m5, [botq+strideq*0+2], 1
+ vinserti128 m10, [botq+strideq*1+2], 1
+ vpblendd m6, [topq+strideq*0-2], 0x01
+ vpblendd m9, [topq+strideq*1-2], 0x01
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movd xm6, [topq +strideq*0-2]
+ movq xm5, [dstq +strideq*2-2]
+ movq xm9, [dst4q+strideq*0-2]
+ movd xm10, [dst4q+strideq*2+2]
+ pinsrd xm6, [topq +strideq*1-2], 1
+ movhps xm5, [dstq +stride3q -2]
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [dst4q+stride3q +2], 1
+ pinsrd xm6, [dstq +strideq*0-2], 2
+ pinsrd xm10, [botq +strideq*0+2], 2
+ pinsrd xm6, [dstq +strideq*1-2], 3
+ pinsrd xm10, [botq +strideq*1+2], 3
+ shufps xm11, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ movu m9, [blend_4x8_3]
+ vinserti128 m6, xm11, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+0x10-8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -2]
+ movq xm9, [top2q -2]
+ movq xm5, [dstq+strideq*2+2]
+ movq xm10, [dstq+stride3q +2]
+ vinserti128 m6, [dstq+strideq*0-2], 1
+ vinserti128 m9, [dstq+strideq*1-2], 1
+ vinserti128 m5, [botq+strideq*0+2], 1
+ vinserti128 m10, [botq+strideq*1+2], 1
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
+ punpcklqdq m5, m10
+%endif
+ ret
+.d5k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*0-1]
+ movd xm9, [topq+strideq*1-1]
+ movd xm5, [dstq+strideq*2+1]
+ movd xm10, [dstq+stride3q +1]
+ pcmpeqd m12, m12
+ pmovzxbw m11, [leftq-8+1]
+ psrld m12, 24
+ vinserti128 m6, [dstq+strideq*0-1], 1
+ vinserti128 m9, [dstq+strideq*1-1], 1
+ vinserti128 m5, [botq+strideq*0+1], 1
+ vinserti128 m10, [botq+strideq*1+1], 1
+ punpckldq m6, m9
+ pxor m9, m9
+ vpblendd m12, m9, 0x0F
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movd xm6, [topq +strideq*0-1]
+ movq xm5, [dstq +strideq*2-1]
+ movq xm9, [dst4q+strideq*0-1]
+ movd xm10, [dst4q+strideq*2+1]
+ pinsrd xm6, [topq +strideq*1-1], 1
+ movhps xm5, [dstq +stride3q -1]
+ movhps xm9, [dst4q+strideq*1-1]
+ pinsrd xm10, [dst4q+stride3q +1], 1
+ pinsrd xm6, [dstq +strideq*0-1], 2
+ pinsrd xm10, [botq +strideq*0+1], 2
+ pinsrd xm6, [dstq +strideq*1-1], 3
+ pinsrd xm10, [botq +strideq*1+1], 3
+ shufps xm11, xm5, xm9, q2020
+ vinserti128 m6, xm11, 1
+ pmovzxbw m11, [leftq-3]
+ psrldq xm5, 2
+ psrldq xm9, 2
+ shufps xm5, xm9, q2020
+ movu m9, [blend_4x8_1]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m11, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -1]
+ movq xm9, [top2q -1]
+ movq xm5, [dstq+strideq*2+1]
+ movq xm10, [dstq+stride3q +1]
+ vinserti128 m6, [dstq+strideq*0-1], 1
+ vinserti128 m9, [dstq+strideq*1-1], 1
+ vinserti128 m5, [botq+strideq*0+1], 1
+ vinserti128 m10, [botq+strideq*1+1], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
+%endif
+ ret
+.d6k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*0]
+ movd xm9, [topq+strideq*1]
+ movd xm5, [dstq+strideq*2]
+ movd xm10, [dstq+stride3q ]
+ vinserti128 m6, [dstq+strideq*0], 1
+ vinserti128 m9, [dstq+strideq*1], 1
+ vinserti128 m5, [botq+strideq*0], 1
+ vinserti128 m10, [botq+strideq*1], 1
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm5, [dstq +strideq*2]
+ movd xm6, [topq +strideq*0]
+ movd xm9, [dst4q+strideq*2]
+ pinsrd xm5, [dstq +stride3q ], 1
+ pinsrd xm6, [topq +strideq*1], 1
+ pinsrd xm9, [dst4q+stride3q ], 1
+ pinsrd xm5, [dst4q+strideq*0], 2
+ pinsrd xm6, [dstq +strideq*0], 2
+ pinsrd xm9, [botq +strideq*0], 2
+ pinsrd xm5, [dst4q+strideq*1], 3
+ pinsrd xm6, [dstq +strideq*1], 3
+ pinsrd xm9, [botq +strideq*1], 3
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+ %endif
+%else
+ movq xm5, [dstq+strideq*2]
+ movq xm9, [botq+strideq*0]
+ movq xm6, [top1q ]
+ movq xm10, [dstq+strideq*0]
+ movhps xm5, [dstq+stride3q ]
+ movhps xm9, [botq+strideq*1]
+ movhps xm6, [top2q ]
+ movhps xm10, [dstq+strideq*1]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+%endif
+ ret
+.d7k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm5, [dstq+strideq*2-1]
+ movd xm9, [dstq+stride3q -1]
+ movd xm6, [topq+strideq*0+1]
+ movd xm10, [topq+strideq*1+1]
+ pinsrb xm5, [leftq+ 5], 0
+ pinsrb xm9, [leftq+ 7], 0
+ vinserti128 m6, [dstq+strideq*0+1], 1
+ vinserti128 m10, [dstq+strideq*1+1], 1
+ vinserti128 m5, [botq+strideq*0-1], 1
+ vinserti128 m9, [botq+strideq*1-1], 1
+ punpckldq m6, m10
+ punpckldq m5, m9
+ %else
+ movd xm6, [topq +strideq*0+1]
+ movq xm9, [dstq +strideq*2-1]
+ movq xm10, [dst4q+strideq*0-1]
+ movd xm11, [dst4q+strideq*2-1]
+ pinsrd xm6, [topq +strideq*1+1], 1
+ movhps xm9, [dstq +stride3q -1]
+ movhps xm10, [dst4q+strideq*1-1]
+ pinsrd xm11, [dst4q+stride3q -1], 1
+ pinsrd xm6, [dstq +strideq*0+1], 2
+ pinsrd xm11, [botq +strideq*0-1], 2
+ pinsrd xm6, [dstq +strideq*1+1], 3
+ pinsrd xm11, [botq +strideq*1-1], 3
+ shufps xm5, xm9, xm10, q2020
+ vinserti128 m5, xm11, 1
+ pmovzxbw m11, [leftq+5]
+ psrldq xm9, 2
+ psrldq xm10, 2
+ shufps xm9, xm10, q2020
+ movu m10, [blend_4x8_1+8]
+ vinserti128 m6, xm9, 1
+ vpblendvb m5, m11, m10
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [dstq+strideq*2-1]
+ movq xm9, [botq+strideq*0-1]
+ movq xm6, [top1q +1]
+ movq xm10, [dstq+strideq*0+1]
+ movhps xm5, [dstq+stride3q -1]
+ movhps xm9, [botq+strideq*1-1]
+ movhps xm6, [top2q +1]
+ movhps xm10, [dstq+strideq*1+1]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+ vpblendvb m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
+%endif
+ ret
+
+.border_block:
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
+%define rstk rsp
+%assign stack_offset stack_offset_entry
+%assign regs_used 11
+ ALLOC_STACK 2*16+(%2+4)*32, 16
+%define px rsp+2*16+2*32
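+; px is a staging buffer of 16-bit pixels with a 32-byte row pitch: rows
+; -2/-1 hold the top border, rows %2 and %2+1 the bottom border, the two
+; words at byte offset -4 hold the left border and the two at +%1*2 the
+; right border; unavailable pixels are filled with the 0x8000 marker (m14).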
+
+ pcmpeqw m14, m14
+ psllw m14, 15 ; 0x8000
+
+ ; prepare pixel buffers - body/right
+%if %1 == 4
+ INIT_XMM avx2
+%endif
+%if %2 == 8
+ lea dst4q, [dstq+strideq*4]
+%endif
+ lea stride3q, [strideq*3]
+ test edgeb, 2 ; have_right
+ jz .no_right
+ pmovzxbw m1, [dstq+strideq*0]
+ pmovzxbw m2, [dstq+strideq*1]
+ pmovzxbw m3, [dstq+strideq*2]
+ pmovzxbw m4, [dstq+stride3q]
+ mova [px+0*32], m1
+ mova [px+1*32], m2
+ mova [px+2*32], m3
+ mova [px+3*32], m4
+%if %2 == 8
+ pmovzxbw m1, [dst4q+strideq*0]
+ pmovzxbw m2, [dst4q+strideq*1]
+ pmovzxbw m3, [dst4q+strideq*2]
+ pmovzxbw m4, [dst4q+stride3q]
+ mova [px+4*32], m1
+ mova [px+5*32], m2
+ mova [px+6*32], m3
+ mova [px+7*32], m4
+%endif
+ jmp .body_done
+.no_right:
+%if %1 == 4
+ movd xm1, [dstq+strideq*0]
+ movd xm2, [dstq+strideq*1]
+ movd xm3, [dstq+strideq*2]
+ movd xm4, [dstq+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+0*32], xm1
+ movq [px+1*32], xm2
+ movq [px+2*32], xm3
+ movq [px+3*32], xm4
+%else
+ pmovzxbw xm1, [dstq+strideq*0]
+ pmovzxbw xm2, [dstq+strideq*1]
+ pmovzxbw xm3, [dstq+strideq*2]
+ pmovzxbw xm4, [dstq+stride3q]
+ mova [px+0*32], xm1
+ mova [px+1*32], xm2
+ mova [px+2*32], xm3
+ mova [px+3*32], xm4
+%endif
+ movd [px+0*32+%1*2], xm14
+ movd [px+1*32+%1*2], xm14
+ movd [px+2*32+%1*2], xm14
+ movd [px+3*32+%1*2], xm14
+%if %2 == 8
+ %if %1 == 4
+ movd xm1, [dst4q+strideq*0]
+ movd xm2, [dst4q+strideq*1]
+ movd xm3, [dst4q+strideq*2]
+ movd xm4, [dst4q+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+4*32], xm1
+ movq [px+5*32], xm2
+ movq [px+6*32], xm3
+ movq [px+7*32], xm4
+ %else
+ pmovzxbw xm1, [dst4q+strideq*0]
+ pmovzxbw xm2, [dst4q+strideq*1]
+ pmovzxbw xm3, [dst4q+strideq*2]
+ pmovzxbw xm4, [dst4q+stride3q]
+ mova [px+4*32], xm1
+ mova [px+5*32], xm2
+ mova [px+6*32], xm3
+ mova [px+7*32], xm4
+ %endif
+ movd [px+4*32+%1*2], xm14
+ movd [px+5*32+%1*2], xm14
+ movd [px+6*32+%1*2], xm14
+ movd [px+7*32+%1*2], xm14
+%endif
+.body_done:
+
+ ; top
+ test edgeb, 4 ; have_top
+ jz .no_top
+ test edgeb, 1 ; have_left
+ jz .top_no_left
+ test edgeb, 2 ; have_right
+ jz .top_no_right
+ pmovzxbw m1, [topq+strideq*0-(%1/2)]
+ pmovzxbw m2, [topq+strideq*1-(%1/2)]
+ movu [px-2*32-%1], m1
+ movu [px-1*32-%1], m2
+ jmp .top_done
+.top_no_right:
+ pmovzxbw m1, [topq+strideq*0-%1]
+ pmovzxbw m2, [topq+strideq*1-%1]
+ movu [px-2*32-%1*2], m1
+ movu [px-1*32-%1*2], m2
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.top_no_left:
+ test edgeb, 2 ; have_right
+ jz .top_no_left_right
+ pmovzxbw m1, [topq+strideq*0]
+ pmovzxbw m2, [topq+strideq*1]
+ mova [px-2*32+0], m1
+ mova [px-1*32+0], m2
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ jmp .top_done
+.top_no_left_right:
+%if %1 == 4
+ movd xm1, [topq+strideq*0]
+ pinsrd xm1, [topq+strideq*1], 1
+ pmovzxbw xm1, xm1
+ movq [px-2*32+0], xm1
+ movhps [px-1*32+0], xm1
+%else
+ pmovzxbw xm1, [topq+strideq*0]
+ pmovzxbw xm2, [topq+strideq*1]
+ mova [px-2*32+0], xm1
+ mova [px-1*32+0], xm2
+%endif
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.no_top:
+ movu [px-2*32-%1], m14
+ movu [px-1*32-%1], m14
+.top_done:
+
+ ; left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ pmovzxbw xm1, [leftq+ 0]
+%if %2 == 8
+ pmovzxbw xm2, [leftq+ 8]
+%endif
+ movd [px+0*32-4], xm1
+ pextrd [px+1*32-4], xm1, 1
+ pextrd [px+2*32-4], xm1, 2
+ pextrd [px+3*32-4], xm1, 3
+%if %2 == 8
+ movd [px+4*32-4], xm2
+ pextrd [px+5*32-4], xm2, 1
+ pextrd [px+6*32-4], xm2, 2
+ pextrd [px+7*32-4], xm2, 3
+%endif
+ jmp .left_done
+.no_left:
+ movd [px+0*32-4], xm14
+ movd [px+1*32-4], xm14
+ movd [px+2*32-4], xm14
+ movd [px+3*32-4], xm14
+%if %2 == 8
+ movd [px+4*32-4], xm14
+ movd [px+5*32-4], xm14
+ movd [px+6*32-4], xm14
+ movd [px+7*32-4], xm14
+%endif
+.left_done:
+
+ ; bottom
+ DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
+ test edgeb, 8 ; have_bottom
+ jz .no_bottom
+ test edgeb, 1 ; have_left
+ jz .bottom_no_left
+ test edgeb, 2 ; have_right
+ jz .bottom_no_right
+ pmovzxbw m1, [botq+strideq*0-(%1/2)]
+ pmovzxbw m2, [botq+strideq*1-(%1/2)]
+ movu [px+(%2+0)*32-%1], m1
+ movu [px+(%2+1)*32-%1], m2
+ jmp .bottom_done
+.bottom_no_right:
+ pmovzxbw m1, [botq+strideq*0-%1]
+ pmovzxbw m2, [botq+strideq*1-%1]
+ movu [px+(%2+0)*32-%1*2], m1
+ movu [px+(%2+1)*32-%1*2], m2
+%if %1 == 8
+ movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
+%endif
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.bottom_no_left:
+ test edgeb, 2 ; have_right
+ jz .bottom_no_left_right
+ pmovzxbw m1, [botq+strideq*0]
+ pmovzxbw m2, [botq+strideq*1]
+ mova [px+(%2+0)*32+0], m1
+ mova [px+(%2+1)*32+0], m2
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ jmp .bottom_done
+.bottom_no_left_right:
+%if %1 == 4
+ movd xm1, [botq+strideq*0]
+ pinsrd xm1, [botq+strideq*1], 1
+ pmovzxbw xm1, xm1
+ movq [px+(%2+0)*32+0], xm1
+ movhps [px+(%2+1)*32+0], xm1
+%else
+ pmovzxbw xm1, [botq+strideq*0]
+ pmovzxbw xm2, [botq+strideq*1]
+ mova [px+(%2+0)*32+0], xm1
+ mova [px+(%2+1)*32+0], xm2
+%endif
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.no_bottom:
+ movu [px+(%2+0)*32-%1], m14
+ movu [px+(%2+1)*32-%1], m14
+.bottom_done:
+
+ ; actual filter
+ INIT_YMM avx2
+ DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
+%undef edged
+ ; register to shuffle values into after packing
+ vbroadcasti128 m12, [shufb_lohi]
+
+ mov dampingd, r8m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ test prid, prid
+ jz .border_sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ test secdmpd, secdmpd
+ jz .border_pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
+
+ BORDER_PREP_REGS %1, %2
+%if %1*%2*2/mmsize > 1
+.border_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2, 1
+.border_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
+ dec kq
+ jge .border_k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ BORDER_ADJUST_PIXEL %1, m10, 1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_v_loop
+%endif
+ RET
+
+.border_pri_only:
+ DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m1, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_pri_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
+ dec kq
+ jge .border_pri_k_loop
+ BORDER_ADJUST_PIXEL %1, m1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_pri_v_loop
+%endif
+ RET
+
+.border_sec_only:
+ DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m0, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_sec_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
+ dec kq
+ jge .border_sec_k_loop
+ BORDER_ADJUST_PIXEL %1, m0
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_sec_v_loop
+%endif
+ RET
+%endmacro
+
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+
+INIT_YMM avx2
+cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
+ lea stride3q, [strideq*3]
+ movq xm0, [srcq+strideq*0]
+ movq xm1, [srcq+strideq*1]
+ movq xm2, [srcq+strideq*2]
+ movq xm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+stride3q ]
+ vpbroadcastq m5, [srcq+strideq*2]
+ vpblendd m0, m4, 0xf0
+ vpblendd m1, m5, 0xf0
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m5, [srcq+strideq*0]
+ vpblendd m2, m4, 0xf0
+ vpblendd m3, m5, 0xf0
+ pxor m4, m4
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+cglobal_label .main
+ vpbroadcastd m4, [pw_128]
+ PROLOGUE 3, 4, 15
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ ; shuffle registers to generate partial_sum_diag[0-1] together
+ vperm2i128 m7, m0, m0, 0x01
+ vperm2i128 m6, m1, m1, 0x01
+ vperm2i128 m5, m2, m2, 0x01
+ vperm2i128 m4, m3, m3, 0x01
+
+ ; start with partial_sum_hv[0-1]
+ paddw m8, m0, m1
+ paddw m9, m2, m3
+ phaddw m10, m0, m1
+ phaddw m11, m2, m3
+ paddw m8, m9
+ phaddw m10, m11
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ paddw xm8, xm9 ; partial_sum_hv[1]
+ phaddw xm10, xm11 ; partial_sum_hv[0]
+ vinserti128 m8, xm10, 1
+ vpbroadcastd m9, [div_table+44]
+ pmaddwd m8, m8
+ pmulld m8, m9 ; cost6[2a-d] | cost2[a-d]
+
+ ; create aggregates [lower half]:
+ ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
+ ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
+ ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
+ ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
+ ; and [upper half]:
+ ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
+ ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
+ ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
+ ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
+ ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m9, m1, 2
+ psrldq m10, m1, 14
+ pslldq m11, m2, 4
+ psrldq m12, m2, 12
+ pslldq m13, m3, 6
+ psrldq m14, m3, 10
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m4, 8
+ psrldq m12, m4, 8
+ pslldq m13, m5, 10
+ psrldq m14, m5, 6
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m6, 12
+ psrldq m12, m6, 4
+ pslldq m13, m7, 14
+ psrldq m14, m7, 2
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero]
+ vbroadcasti128 m14, [shufw_6543210x]
+ vbroadcasti128 m13, [div_table+16]
+ vbroadcasti128 m12, [div_table+0]
+ paddw m9, m0 ; partial_sum_diag[0/1][0-7]
+ pshufb m10, m14
+ punpckhwd m11, m9, m10
+ punpcklwd m9, m10
+ pmaddwd m11, m11
+ pmaddwd m9, m9
+ pmulld m11, m13
+ pmulld m9, m12
+ paddd m9, m11 ; cost0[a-d] | cost4[a-d]
+
+ ; merge horizontally and vertically for partial_sum_alt[0-3]
+ paddw m10, m0, m1
+ paddw m11, m2, m3
+ paddw m12, m4, m5
+ paddw m13, m6, m7
+ phaddw m0, m4
+ phaddw m1, m5
+ phaddw m2, m6
+ phaddw m3, m7
+
+ ; create aggregates [lower half]:
+ ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
+ ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
+ ; and [upper half]:
+ ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
+ ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
+ ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
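+ ; roughly, the four "alt" projections built here and in the next block;
+ ; scalar sketch, following the C reference:
+ ;   partial_sum_alt[0][     y + (x >> 1)] += px[y][x]
+ ;   partial_sum_alt[1][3 +  y - (x >> 1)] += px[y][x]
+ ;   partial_sum_alt[2][3 - (y >> 1) + x ] += px[y][x]
+ ;   partial_sum_alt[3][    (y >> 1) + x ] += px[y][x]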
+
+ pslldq m4, m11, 2
+ psrldq m11, 14
+ pslldq m5, m12, 4
+ psrldq m12, 12
+ pslldq m6, m13, 6
+ psrldq m13, 10
+ paddw m4, m10
+ paddw m11, m12
+ vpbroadcastd m12, [div_table+44]
+ paddw m5, m6
+ paddw m11, m13 ; partial_sum_alt[3/2] right
+ vbroadcasti128 m13, [div_table+32]
+ paddw m4, m5 ; partial_sum_alt[3/2] left
+ pshuflw m5, m11, q3012
+ punpckhwd m6, m11, m4
+ punpcklwd m4, m5
+ pmaddwd m6, m6
+ pmaddwd m4, m4
+ pmulld m6, m12
+ pmulld m4, m13
+ paddd m4, m6 ; cost7[a-d] | cost5[a-d]
+
+ ; create aggregates [lower half]:
+ ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
+ ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
+ ; and [upper half]:
+ ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
+ ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
+ ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m5, m1, 2
+ psrldq m1, 14
+ pslldq m6, m2, 4
+ psrldq m2, 12
+ pslldq m7, m3, 6
+ psrldq m3, 10
+ paddw m5, m0
+ paddw m1, m2
+ paddw m6, m7
+ paddw m1, m3 ; partial_sum_alt[0/1] right
+ paddw m5, m6 ; partial_sum_alt[0/1] left
+ pshuflw m0, m1, q3012
+ punpckhwd m1, m5
+ punpcklwd m5, m0
+ pmaddwd m1, m1
+ pmaddwd m5, m5
+ pmulld m1, m12
+ pmulld m5, m13
+ paddd m5, m1 ; cost1[a-d] | cost3[a-d]
+
+ mova xm0, [pd_47130256+ 16]
+ mova m1, [pd_47130256]
+ phaddd m9, m8
+ phaddd m5, m4
+ phaddd m9, m5
+ vpermd m0, m9 ; cost[0-3]
+ vpermd m1, m9 ; cost[4-7] | cost[0-3]
+
+ ; now find the best cost
+ pmaxsd xm2, xm0, xm1
+ pshufd xm3, xm2, q1032
+ pmaxsd xm2, xm3
+ pshufd xm3, xm2, q2301
+ pmaxsd xm2, xm3 ; best cost
+
+ ; find the idx using minpos
+ ; make everything other than the best cost negative via subtraction
+ ; find the min of unsigned 16-bit ints to sort out the negative values
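+ ; scalar sketch: best_dir = argmin_i((uint16_t)(cost[i] - best_cost)), since
+ ; only the best cost maps to 0 and every other lane wraps around to a large
+ ; unsigned value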
+ psubd xm4, xm1, xm2
+ psubd xm3, xm0, xm2
+ packssdw xm3, xm4
+ phminposuw xm3, xm3
+
+ ; convert idx to 32-bits
+ psrld xm3, 16
+ movd eax, xm3
+
+ ; get idx^4 complement
+ vpermd m3, m1
+ psubd xm2, xm3
+ psrld xm2, 10
+ movd [varq], xm2
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/cdef_avx512.asm b/third_party/dav1d/src/x86/cdef_avx512.asm
new file mode 100644
index 0000000000..b4f9c008ca
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_avx512.asm
@@ -0,0 +1,860 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+%macro DUP4 1-*
+ %rep %0
+ times 4 db %1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DIRS 16 ; cdef_directions[]
+ %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1
+ ; masking away unused bits allows us to use a single vpaddd {1to16}
+ ; instruction instead of having to do vpbroadcastd + paddb
+ db %13 & 0x3f, -%13 & 0x3f
+ %rotate 1
+ %endrep
+%endmacro
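+; the pairs above are effectively added per byte to px_idx with one
+; dword-broadcast vpaddd: masking to 6 bits keeps every byte sum below 0x100
+; (no carry between byte lanes), and vpermb only reads the low 6 bits of each
+; index byte, so negative offsets still wrap to the right lut slot.
+; rough scalar sketch:
+;   pN = lut[(px_idx[i] + cdef_dirs[dir][k]) & 63]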
+
+SECTION_RODATA 64
+
+lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13
+ db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
+ db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
+lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13
+lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
+ db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
+ db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
+ db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
+pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7
+lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55
+ db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
+lut_perm_8x8b: db 12, 13, 0, 1, 2, 3, 4, 5, 14, 15, 16, 17, 18, 19, 20, 21
+ db 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25
+ db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53
+ db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57
+end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+end_perm_clip: db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
+ db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
+ db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
+ db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
+edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
+ dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
+ dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
+ dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
+ dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
+ dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
+ dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
+ dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
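+; each qword is a bitmap over the 64 lut bytes: bit b is set iff lut byte b
+; holds a real pixel for that have_left/right/top/bottom combination (low 4
+; edge bits). vpshufbitqmb turns a vector of lut indices into a k-mask of
+; in-range taps; masked-off taps keep px and thus contribute nothing.
+; rough scalar sketch:
+;   valid = (edge_mask[edge & 15] >> (idx & 63)) & 1
+;   p     = valid ? lut[idx] : px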
+px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
+cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15
+gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
+ dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
+ dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
+ dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
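+; GF(2) affine bit matrices implementing a per-byte logical right shift by
+; 0..6 (AVX-512 has no variable byte shift); indexing with damping and
+; lzcnt(strength) selects the CDEF shift. rough scalar sketch:
+;   shifted = abs(diff) >> imax(0, damping - log2(strength))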
+pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4
+sec_tap: db 32, 32, 16, 16
+pd_268435568: dd 268435568
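+; accumulator bias: the taps above are stored pre-shifted by 4, so vpdpbusd
+; accumulates sum*16, and (1 << 28) + (7 << 4) folds CDEF's final rounding
+; into the byte shuffles at the end. rough scalar result, following the C
+; reference:
+;   out = px + ((8 + sum - (sum < 0)) >> 4)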
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 4
+%else
+DECLARE_REG_TMP 8
+%endif
+
+; lut:
+; t0 t1 t2 t3 t4 t5 t6 t7
+; T0 T1 T2 T3 T4 T5 T6 T7
+; L0 L1 00 01 02 03 04 05
+; L2 L3 10 11 12 13 14 15
+; L4 L5 20 21 22 23 24 25
+; L6 L7 30 31 32 33 34 35
+; b0 b1 b2 b3 b4 b5 b6 b7
+; B0 B1 B2 B3 B4 B5 B6 B7
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r7-edge_mask
+ movq xmm0, [dstq+strideq*0]
+ movhps xmm0, [dstq+strideq*1]
+ lea r7, [edge_mask]
+ movq xmm1, [topq+strideq*0-2]
+ movhps xmm1, [topq+strideq*1-2]
+ mov r6d, edgem
+ vinserti32x4 ym0, ymm0, [leftq], 1
+ lea r2, [strideq*3]
+ vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1
+ mova m5, [base+lut_perm_4x4]
+ vinserti32x4 m0, [dstq+r2], 2
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 m1, [botq+strideq*0-4], 2
+ vinserti32x4 m0, [botq+strideq*1-4], 3
+.main:
+ movifnidn prid, prim
+ mov t0d, dirm
+ mova m3, [base+px_idx]
+ mov r3d, dampingm
+ vpermi2b m5, m0, m1 ; lut
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m7, m7
+ lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m6, m3, m5 ; px
+ cmp r6d, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1
+%macro CDEF_FILTER_4x4_PRI 0
+ vpcmpub k1, m6, m1, 6 ; px > pN
+ psubb m2, m1, m6
+ lzcnt r6d, prid
+ vpsubb m2{k1}, m6, m1 ; abs(diff)
+ vpbroadcastb m4, prid
+ and prid, 1
+ vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
+ movifnidn secd, secm
+ vpbroadcastd m10, [base+pri_tap+priq*4]
+ vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap)
+ psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift)))
+ pminub m2, m4
+ vpdpbusd m0, m2, m10 ; sum
+%endmacro
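+ ; per-tap scalar sketch of what the macro above accumulates (illustrative):
+ ;   d    = p[k] - px
+ ;   sum += pri_tap[pri & 1][k] * apply_sign(imin(abs(d),
+ ;                     imax(0, pri_strength - (abs(d) >> shift))), d)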
+ CDEF_FILTER_4x4_PRI
+ test secd, secd
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m4, m6, m1
+ pmaxub m1, m6
+ pminub m5, m2, m3
+ pmaxub m2, m3
+ pminub m4, m5
+ pmaxub m2, m1
+ psrldq m1, m4, 2
+ psrldq m3, m2, 2
+ pminub m1, m4
+ vpcmpw k1, m0, m7, 1
+ vpshldd m6, m0, 8
+ pmaxub m2, m3
+ pslldq m3, m1, 1
+ psubw m7, m0
+ paddusw m0, m6 ; clip >0xff
+ vpsubusw m0{k1}, m6, m7 ; clip <0x00
+ pslldq m4, m2, 1
+ pminub m1, m3
+ pmaxub m2, m4
+ pmaxub m0, m1
+ pminub m0, m2
+ jmp .end
+.sec_only:
+ movifnidn secd, secm
+ call .sec
+.end_no_clip:
+ vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+.end:
+ mova xm1, [base+end_perm]
+ vpermb m0, m1, m0 ; output in bits 8-15 of each dword
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ RET
+.mask_edges_sec_only:
+ movifnidn secd, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ vpbroadcastq m8, [base+edge_mask+r6*8]
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m8, m2 ; index in-range
+ mova m1, m6
+ vpermb m1{k1}, m2, m5
+ CDEF_FILTER_4x4_PRI
+ test secd, secd
+ jz .end_no_clip
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m8, m4
+ mova m2, m6
+ vpermb m2{k1}, m4, m5
+ vpshufbitqmb k1, m8, m9
+ mova m3, m6
+ vpermb m3{k1}, m9, m5
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
+ vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
+.sec_main:
+ vpbroadcastd m8, [base+sec_tap]
+ vpcmpub k1, m6, m2, 6
+ psubb m4, m2, m6
+ vpbroadcastb m12, secd
+ lzcnt secd, secd
+ vpsubb m4{k1}, m6, m2
+ vpcmpub k2, m6, m3, 6
+ vpbroadcastq m11, [r3+secq*8]
+ gf2p8affineqb m10, m4, m11, 0
+ psubb m5, m3, m6
+ mova m9, m8
+ vpsubb m8{k1}, m7, m8
+ psubusb m10, m12, m10
+ vpsubb m5{k2}, m6, m3
+ pminub m4, m10
+ vpdpbusd m0, m4, m8
+ gf2p8affineqb m11, m5, m11, 0
+ vpsubb m9{k2}, m7, m9
+ psubusb m12, m11
+ pminub m5, m12
+ vpdpbusd m0, m5, m9
+ ret
+
+DECLARE_REG_TMP 2, 7
+
+; lut top lut bottom
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55
+; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7
+; La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7
+
+cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r8-edge_mask
+ vpbroadcastd ym21, strided
+ mov r6d, edgem
+ lea r8, [edge_mask]
+ movq xm1, [topq+strideq*0-2]
+ pmulld ym21, [base+pd_01234567]
+ kxnorb k1, k1, k1
+ movq xm2, [topq+strideq*1-2]
+ vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7
+ mova m14, [base+lut_perm_4x8a]
+ movu m15, [base+lut_perm_4x8b]
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 ym1, [botq+strideq*0-2], 1
+ vinserti32x4 ym2, [botq+strideq*1-2], 1
+.main:
+ punpcklqdq ym1, ym2
+ vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____
+ movifnidn prid, prim
+ mov t0d, dirm
+ mova m16, [base+px_idx]
+ mov r3d, dampingm
+ vpermi2b m14, m0, m1 ; lut top
+ vpermi2b m15, m0, m1 ; lut bottom
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m20, m20
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m2, m16, m14 ; pxt
+ vpermb m3, m16, m15 ; pxb
+ mova m1, m0
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1
+ vpermb m5, m6, m15 ; pNb
+%macro CDEF_FILTER_4x8_PRI 0
+ vpcmpub k1, m2, m4, 6 ; pxt > pNt
+ vpcmpub k2, m3, m5, 6 ; pxb > pNb
+ psubb m6, m4, m2
+ psubb m7, m5, m3
+ lzcnt r6d, prid
+ vpsubb m6{k1}, m2, m4 ; abs(diff_top)
+ vpsubb m7{k2}, m3, m5 ; abs(diff_bottom)
+ vpbroadcastb m13, prid
+ vpbroadcastq m9, [r3+r6*8]
+ and prid, 1
+ vpbroadcastd m11, [base+pri_tap+priq*4]
+ vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
+ vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
+ mova m10, m11
+ movifnidn t1d, secm
+ vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top)
+ vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom)
+ psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift)))
+ psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift)))
+ pminub m6, m12
+ pminub m7, m13
+ vpdpbusd m0, m6, m10 ; sum top
+ vpdpbusd m1, m7, m11 ; sum bottom
+%endmacro
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m10, m4, m2
+ pminub m12, m6, m8
+ pminub m11, m5, m3
+ pminub m13, m7, m9
+ pmaxub m4, m2
+ pmaxub m6, m8
+ pmaxub m5, m3
+ pmaxub m7, m9
+ pminub m10, m12
+ pminub m11, m13
+ pmaxub m4, m6
+ pmaxub m5, m7
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d
+ kxnorb k2, k2, k2 ; hw lw
+ vpshrdd m12, m0, m1, 16 ; m1lw m0hw
+ vpshrdd m6, m10, m11, 16 ; m11lw m10hw
+ vpshrdd m8, m4, m5, 16 ; m5lw m4hw
+ vpblendmw m7{k1}, m10, m11 ; m11hw m10lw
+ vpblendmw m9{k1}, m4, m5 ; m5hw m4lw
+ vpblendmw m4{k1}, m0, m12 ; m1lw m0lw
+ vpblendmw m5{k1}, m12, m1 ; m1hw m0hw
+ vpshrdd m2, m3, 16
+ pminub m6, m7
+ pmaxub m8, m9
+ mova ym14, [base+end_perm]
+ vpcmpw k1, m4, m20, 1
+ vpshldw m2, m5, 8
+ pslldq m7, m6, 1
+ pslldq m9, m8, 1
+ psubw m5, m20, m4
+ paddusw m0, m4, m2 ; clip >0xff
+ pminub m6, m7
+ pmaxub m8, m9
+ psubusw m0{k1}, m2, m5 ; clip <0x00
+ pmaxub m0, m6
+ pminub m0, m8
+ vpermb m0, m14, m0
+ vpscatterdd [dstq+ym21]{k2}, ym0
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova ym4, [base+end_perm]
+ kxnorb k1, k1, k1
+ vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m3, m1, 8
+ paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m3
+ pslld m0, 16
+ vpshrdd m0, m1, 16
+ vpermb m0, m4, m0 ; output in bits 8-15 of each word
+ vpscatterdd [dstq+ym21]{k1}, ym0
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t1d, r6d
+ or r6d, 8 ; top 4x4 has bottom
+ or t1d, 4 ; bottom 4x4 has top
+ vpbroadcastq m17, [base+edge_mask+r6*8]
+ vpbroadcastq m18, [base+edge_mask+t1*8]
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m17, m6 ; index in-range
+ vpshufbitqmb k2, m18, m6
+ mova m4, m2
+ mova m5, m3
+ vpermb m4{k1}, m6, m14
+ vpermb m5{k2}, m6, m15
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m17, m10
+ vpshufbitqmb k2, m18, m10
+ vpshufbitqmb k3, m17, m11
+ vpshufbitqmb k4, m18, m11
+ mova m6, m2
+ mova m7, m3
+ mova m8, m2
+ mova m9, m3
+ vpermb m6{k1}, m10, m14
+ vpermb m7{k2}, m10, m15
+ vpermb m8{k3}, m11, m14
+ vpermb m9{k4}, m11, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
+ vpermb m7, m8, m15 ; pNb
+ vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
+ vpermb m9, m9, m15 ; pNb
+.sec_main:
+ vpbroadcastb m18, t1d
+ lzcnt t1d, t1d
+ vpcmpub k1, m2, m6, 6
+ vpcmpub k2, m3, m7, 6
+ vpcmpub k3, m2, m8, 6
+ vpcmpub k4, m3, m9, 6
+ vpbroadcastq m17, [r3+t1*8]
+ psubb m10, m6, m2
+ psubb m11, m7, m3
+ psubb m12, m8, m2
+ psubb m13, m9, m3
+ vpsubb m10{k1}, m2, m6 ; abs(dt0)
+ vpsubb m11{k2}, m3, m7 ; abs(db0)
+ vpsubb m12{k3}, m2, m8 ; abs(dt1)
+ vpsubb m13{k4}, m3, m9 ; abs(db1)
+ vpbroadcastd m19, [base+sec_tap]
+ gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
+ gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
+ gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
+ gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
+ psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift)))
+ psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift)))
+ psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift)))
+ psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift)))
+ pminub m10, m14
+ pminub m11, m15
+ pminub m12, m16
+ pminub m13, m17
+ mova m14, m19
+ mova m15, m19
+ mova m16, m19
+ vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0)
+ vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0)
+ vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1)
+ vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1)
+ vpdpbusd m0, m10, m14
+ vpdpbusd m1, m11, m15
+ vpdpbusd m0, m12, m16
+ vpdpbusd m1, m13, m19
+ ret
+
+; lut tl lut tr
+; t0 t1 t2 t3 t4 t5 t6 t7 t4 t5 t6 t7 t8 t9 ta tb
+; T0 T1 T2 T3 T4 T5 T6 T7 T4 T5 T6 T7 T8 T9 Ta Tb
+; L0 L1 00 01 02 03 04 05 02 03 04 05 06 07 08 09
+; L2 L3 10 11 12 13 14 15 12 13 14 15 16 17 18 19
+; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29
+; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39
+; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49
+; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59
+; lut bl lut br
+; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29
+; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39
+; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49
+; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59
+; Lc Ld 60 61 62 63 64 65 62 63 64 65 66 67 68 69
+; Le Lf 70 71 72 73 74 75 72 73 74 75 76 77 78 79
+; b0 b1 b2 b3 b4 b5 b6 b7 b4 b5 b6 b7 b8 b9 ba bb
+; B0 B1 B2 B3 B4 B5 B6 B7 B4 B5 B6 B7 B8 B9 Ba Bb
+
+cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r8-edge_mask
+ movu xm16, [dstq+strideq*0]
+ pinsrd xm16, [leftq+4*0], 3
+ mov r6d, edgem
+ vinserti128 ym16, [dstq+strideq*1], 1
+ lea r10, [dstq+strideq*4]
+ movu xm17, [dstq+strideq*2]
+ vinserti32x4 m16, [topq+strideq*0-2], 2
+ lea r9, [strideq*3]
+ pinsrd xm17, [leftq+4*1], 3
+ vinserti32x4 m16, [topq+strideq*1-2], 3 ; 0 1 t T
+ lea r8, [edge_mask]
+ vinserti128 ym17, [dstq+r9 ], 1
+ vpbroadcastd ym18, [leftq+4*2]
+ vpblendd ym17, ym18, 0x80
+ movu xm18, [r10 +strideq*2]
+ vinserti32x4 m17, [r10 +strideq*0], 2
+ pinsrd xm18, [leftq+4*3], 3
+ vinserti32x4 m17, [r10 +strideq*1], 3 ; 2 3 4 5
+ vinserti128 ym18, [r10 +r9 ], 1
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 m18, [botq+strideq*0-2], 2
+ vinserti32x4 m18, [botq+strideq*1-2], 3 ; 6 7 b B
+.main:
+ mova m0, [base+lut_perm_8x8a]
+ movu m1, [base+lut_perm_8x8b]
+ mova m30, [base+px_idx]
+ vpermb m16, m0, m16
+ movifnidn prid, prim
+ vpermb m17, m1, m17
+ mov t0d, dirm
+ vpermb m18, m0, m18
+ mov r3d, dampingm
+ vshufi32x4 m12, m16, m17, q2020 ; lut tl
+ vshufi32x4 m13, m16, m17, q3131 ; lut tr
+ vshufi32x4 m14, m17, m18, q0220 ; lut bl
+ vshufi32x4 m15, m17, m18, q1331 ; lut br
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m31, m31
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m4, m30, m12 ; pxtl
+ mova m1, m0
+ vpermb m5, m30, m13 ; pxtr
+ mova m2, m0
+ vpermb m6, m30, m14 ; pxbl
+ mova m3, m0
+ vpermb m7, m30, m15 ; pxbr
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1
+ vpermb m9, m11, m13 ; pNtr
+ vpermb m10, m11, m14 ; pNbl
+ vpermb m11, m11, m15 ; pNbr
+%macro CDEF_FILTER_8x8_PRI 0
+ vpcmpub k1, m4, m8, 6 ; pxtl > pNtl
+ vpcmpub k2, m5, m9, 6 ; pxtr > pNtr
+ vpcmpub k3, m6, m10, 6 ; pxbl > pNbl
+ vpcmpub k4, m7, m11, 6 ; pxbr > pNbr
+ psubb m16, m8, m4
+ psubb m17, m9, m5
+ psubb m18, m10, m6
+ psubb m19, m11, m7
+ lzcnt r6d, prid
+ vpsubb m16{k1}, m4, m8 ; abs(diff_tl)
+ vpsubb m17{k2}, m5, m9 ; abs(diff_tr)
+ vpsubb m18{k3}, m6, m10 ; abs(diff_bl)
+ vpsubb m19{k4}, m7, m11 ; abs(diff_br)
+ vpbroadcastq m28, [r3+r6*8]
+ vpbroadcastb m29, prid
+ and prid, 1
+ vpbroadcastd m27, [base+pri_tap+priq*4]
+ vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
+ vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
+ vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
+ vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
+ mova m24, m27
+ mova m25, m27
+ mova m26, m27
+ movifnidn t1d, secm
+ vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl)
+ vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr)
+ vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_bl)
+ vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_br)
+ psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift)))
+ psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift)))
+ psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift)))
+ psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift)))
+ pminub m16, m20
+ pminub m17, m21
+ pminub m18, m22
+ pminub m19, m23
+ vpdpbusd m0, m16, m24 ; sum tl
+ vpdpbusd m1, m17, m25 ; sum tr
+ vpdpbusd m2, m18, m26 ; sum bl
+ vpdpbusd m3, m19, m27 ; sum br
+%endmacro
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m20, m8, m4
+ pminub m24, m12, m16
+ pminub m21, m9, m5
+ pminub m25, m13, m17
+ pminub m22, m10, m6
+ pminub m26, m14, m18
+ pminub m23, m11, m7
+ pminub m27, m15, m19
+ pmaxub m8, m4
+ pmaxub m12, m16
+ pmaxub m9, m5
+ pmaxub m13, m17
+ pmaxub m10, m6
+ pmaxub m14, m18
+ pmaxub m11, m7
+ pmaxub m15, m19
+ pminub m20, m24
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ pmaxub m8, m12
+ pmaxub m9, m13
+ pmaxub m10, m14
+ pmaxub m11, m15
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d
+ vpshrdd m24, m0, m1, 16
+ vpshrdd m25, m2, m3, 16
+ vpshrdd m12, m20, m21, 16
+ vpshrdd m14, m22, m23, 16
+ vpshrdd m16, m8, m9, 16
+ vpshrdd m18, m10, m11, 16
+ vpblendmw m13{k1}, m20, m21
+ vpblendmw m15{k1}, m22, m23
+ vpblendmw m17{k1}, m8, m9
+ vpblendmw m19{k1}, m10, m11
+ vpblendmw m20{k1}, m0, m24
+ vpblendmw m21{k1}, m24, m1
+ vpblendmw m22{k1}, m2, m25
+ vpblendmw m23{k1}, m25, m3
+ vpshrdd m4, m5, 16
+ vpshrdd m6, m7, 16
+ pminub m12, m13
+ pminub m14, m15
+ pmaxub m16, m17
+ pmaxub m18, m19
+ mova m8, [base+end_perm_clip]
+ vpcmpw k2, m20, m31, 1
+ vpcmpw k3, m22, m31, 1
+ vpshldw m4, m21, 8
+ vpshldw m6, m23, 8
+ kunpckdq k1, k1, k1
+ kxnorb k4, k4, k4
+ vpshrdw m11, m12, m14, 8
+ vpshrdw m15, m16, m18, 8
+ vpblendmb m13{k1}, m12, m14
+ vpblendmb m17{k1}, m16, m18
+ psubw m21, m31, m20
+ psubw m23, m31, m22
+ paddusw m0, m20, m4 ; clip >0xff
+ paddusw m1, m22, m6
+ pminub m11, m13
+ pmaxub m15, m17
+ psubusw m0{k2}, m4, m21 ; clip <0x00
+ psubusw m1{k3}, m6, m23
+ psrlw m0, 8
+ vmovdqu8 m0{k1}, m1
+ pmaxub m0, m11
+ pminub m0, m15
+ vpermb m0, m8, m0
+ vextracti32x4 xm1, m0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*2], xm1
+ movq [r10 +strideq*0], xm2
+ movq [r10 +strideq*2], xm3
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+r9 ], xm1
+ movhps [r10 +strideq*1], xm2
+ movhps [r10 +r9 ], xm3
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova xm8, [base+end_perm]
+ kxnorb k1, k1, k1
+ vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m5, m1, 8
+ vpshldd m6, m2, 8
+ vpshldd m7, m3, 8
+ paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ vpermb m0, m8, m0
+ vpermb m1, m8, m1
+ vpermb m2, m8, m2
+ vpermb m3, m8, m3
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm0
+ movq [r10 +strideq*0], xm5
+ movq [r10 +strideq*2], xm2
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+r9 ], xm0
+ movhps [r10 +strideq*1], xm5
+ movhps [r10 +r9 ], xm2
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t0d, r6d
+ mov t1d, r6d
+ or t0d, 0xA ; top-left 4x4 has bottom and right
+ or t1d, 0x9 ; top-right 4x4 has bottom and left
+ vpbroadcastq m26, [base+edge_mask+t0*8]
+ vpbroadcastq m27, [base+edge_mask+t1*8]
+ mov t1d, r6d
+ or r6d, 0x6 ; bottom-left 4x4 has top and right
+ or t1d, 0x5 ; bottom-right 4x4 has top and left
+ vpbroadcastq m28, [base+edge_mask+r6*8]
+ vpbroadcastq m29, [base+edge_mask+t1*8]
+ mov t0d, dirm
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m26, m20 ; index in-range
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m8, m4
+ mova m9, m5
+ mova m10, m6
+ mova m11, m7
+ vpermb m8{k1}, m20, m12
+ vpermb m9{k2}, m20, m13
+ vpermb m10{k3}, m20, m14
+ vpermb m11{k4}, m20, m15
+ mova [rsp+0x00], m26
+ mova [rsp+0x40], m27
+ mova [rsp+0x80], m28
+ mova [rsp+0xC0], m29
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ mova m26, [rsp+0x00]
+ mova m27, [rsp+0x40]
+ mova m28, [rsp+0x80]
+ mova m29, [rsp+0xC0]
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m26, m20
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m16, m4
+ mova m17, m5
+ mova m18, m6
+ mova m19, m7
+ vpermb m16{k1}, m20, m12
+ vpermb m17{k2}, m20, m13
+ vpermb m18{k3}, m20, m14
+ vpermb m19{k4}, m20, m15
+ vpshufbitqmb k1, m26, m21
+ vpshufbitqmb k2, m27, m21
+ vpshufbitqmb k3, m28, m21
+ vpshufbitqmb k4, m29, m21
+ vpermb m12, m21, m12
+ vpermb m13, m21, m13
+ vpermb m14, m21, m14
+ vpermb m15, m21, m15
+ vpblendmb m12{k1}, m4, m12
+ vpblendmb m13{k2}, m5, m13
+ vpblendmb m14{k3}, m6, m14
+ vpblendmb m15{k4}, m7, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
+ vpermb m17, m20, m13 ; pNtr
+ vpermb m18, m20, m14 ; pNbl
+ vpermb m19, m20, m15 ; pNbr
+ vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
+ vpermb m13, m21, m13 ; pNtr
+ vpermb m14, m21, m14 ; pNbl
+ vpermb m15, m21, m15 ; pNbr
+.sec_main:
+%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
+ vpcmpub k1, m4, %1, 6
+ vpcmpub k2, m5, %2, 6
+ vpcmpub k3, m6, %3, 6
+ vpcmpub k4, m7, %4, 6
+ psubb m20, %1, m4
+ psubb m21, %2, m5
+ psubb m22, %3, m6
+ psubb m23, %4, m7
+%if %5
+ vpbroadcastb m28, t1d
+ lzcnt t1d, t1d
+ vpbroadcastq m29, [r3+t1*8]
+%endif
+ vpsubb m20{k1}, m4, %1
+ vpsubb m21{k2}, m5, %2
+ vpsubb m22{k3}, m6, %3
+ vpsubb m23{k4}, m7, %4
+ gf2p8affineqb m24, m20, m29, 0
+ gf2p8affineqb m25, m21, m29, 0
+ gf2p8affineqb m26, m22, m29, 0
+ gf2p8affineqb m27, m23, m29, 0
+%if %5
+ vpbroadcastd m30, [base+sec_tap]
+%endif
+ psubusb m24, m28, m24
+ psubusb m25, m28, m25
+ psubusb m26, m28, m26
+ psubusb m27, m28, m27
+ pminub m20, m24
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ mova m24, m30
+ mova m25, m30
+ mova m26, m30
+ mova m27, m30
+ vpsubb m24{k1}, m31, m30
+ vpsubb m25{k2}, m31, m30
+ vpsubb m26{k3}, m31, m30
+ vpsubb m27{k4}, m31, m30
+ vpdpbusd m0, m20, m24
+ vpdpbusd m1, m21, m25
+ vpdpbusd m2, m22, m26
+ vpdpbusd m3, m23, m27
+%endmacro
+ CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
+ CDEF_FILTER_8x8_SEC m12, m13, m14, m15
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/cdef_sse.asm b/third_party/dav1d/src/x86/cdef_sse.asm
new file mode 100644
index 0000000000..1b353121f4
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_sse.asm
@@ -0,0 +1,1357 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2019, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%macro DUP8 1-*
+ %rep %0
+ times 8 db %1
+ %rotate 1
+ %endrep
+%endmacro
+
+div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
+ dd 420, 210, 140, 105, 105, 105, 105, 105
+div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
+ dw 168, 168, 140, 140, 120, 120, 105, 105
+ dw 420, 420, 210, 210, 140, 140, 105, 105
+ dw 105, 105, 105, 105, 105, 105, 105, 105
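+; 840 divided by the number of pixels contributing to each partial sum: the
+; squared sums are weighted by the reciprocal of their element count while
+; staying integer-only. rough scalar sketch:
+;   cost[dir] += partial_sum[i] * partial_sum[i] * (840 / count[i])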
+const shufw_6543210x, \
+ db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pw_8: times 8 dw 8
+pw_128: times 8 dw 128
+pw_256: times 8 dw 256
+pw_2048: times 8 dw 2048
+pw_0x7FFF: times 8 dw 0x7FFF
+pw_0x8000: times 8 dw 0x8000
+tap_table: ; masks for 8-bit shift emulation
+ DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
+ ; weights
+ DUP8 4, 2, 3, 3, 2, 1
+ ; taps indices
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+ db 1 * 16 + 0, 2 * 16 + 0
+ db 1 * 16 + 0, 2 * 16 - 1
+ ; the last 6 are repeats of the first 6 so we don't need to & 7
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
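+; the first DUP8 row above emulates a per-byte shift: SSE has no 8-bit shift,
+; so abs(diff) is first ANDed with 8 copies of 0xff << shift and then shifted
+; with psrlw; the bits that would otherwise leak in from the neighbouring
+; byte are already zero. rough scalar sketch:
+;   per byte: ((x & (0xff << s)) >> s) == (x >> s) even inside a 16-bit lane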
+
+SECTION .text
+
+%macro movif32 2
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro PMOVZXBW 2-3 0 ; %3 = half
+ %if cpuflag(sse4) && %3 == 0
+ pmovzxbw %1, %2
+ %else
+ %if %3 == 1
+ movd %1, %2
+ %else
+ movq %1, %2
+ %endif
+ punpcklbw %1, m7
+ %endif
+%endmacro
+
+%macro PSHUFB_0 2
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ punpcklbw %1, %1
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+ %endif
+%endmacro
+
+%macro MOVDDUP 2
+%if cpuflag(ssse3)
+ movddup %1, %2
+%else
+ movq %1, %2
+ punpcklqdq %1, %1
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
+ ; load p0/p1
+ movsx offq, byte [dirq+kq+%1+14*8] ; off1
+ %if %6 == 4
+ movq m5, [stkq+offq*2+32*0] ; p0
+ movhps m5, [stkq+offq*2+32*1]
+ %else
+ movu m5, [stkq+offq*2+32*0] ; p0
+ %endif
+ neg offq ; -off1
+ %if %6 == 4
+ movq m6, [stkq+offq*2+32*0] ; p1
+ movhps m6, [stkq+offq*2+32*1]
+ %else
+ movu m6, [stkq+offq*2+32*0] ; p1
+ %endif
+ %if %7
+ %if cpuflag(sse4)
+ ; out of bounds values are set to a value that is both a large unsigned
+ ; value and a negative signed value.
+ ; use signed max and unsigned min to remove them
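+ ; scalar view: the padding sentinel is 0x8000, i.e. INT16_MIN as signed and
+ ; 32768 as unsigned, so
+ ;   max = imax(max, (int16_t)p); min = imin(min, (uint16_t)p)
+ ; both skip the padding while real 8-bit pixels (0..255) pass through.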
+ pmaxsw m7, m5
+ pminuw m8, m5
+ pmaxsw m7, m6
+ pminuw m8, m6
+ %else
+ pcmpeqw m3, m14, m5
+ pminsw m8, m5 ; min after p0
+ pandn m3, m5
+ pmaxsw m7, m3 ; max after p0
+ pcmpeqw m3, m14, m6
+ pminsw m8, m6 ; min after p1
+ pandn m3, m6
+ pmaxsw m7, m3 ; max after p1
+ %endif
+ %endif
+
+ ; accumulate sum[m13] over p0/p1
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+ packsswb m5, m6 ; convert pixel diff to 8-bit
+ %if cpuflag(ssse3)
+ pshufb m5, m13 ; group diffs p0 and p1 into pairs
+ pabsb m6, m5
+ psignb m3, %5, m5
+ %else
+ movlhps m6, m5
+ punpckhbw m6, m5
+ pxor m5, m5
+ pcmpgtb m5, m6
+ paddb m6, m5
+ pxor m6, m5
+ paddb m3, %5, m5
+ pxor m3, m5
+ %endif
+ pand m9, %3, m6 ; emulate 8-bit shift
+ psrlw m9, %2
+ psubusb m5, %4, m9
+ pminub m5, m6 ; constrain(diff_p)
+ %if cpuflag(ssse3)
+ pmaddubsw m5, m3 ; constrain(diff_p) * taps
+ %else
+ psrlw m9, m5, 8
+ psraw m6, m3, 8
+ psllw m5, 8
+ psllw m3, 8
+ pmullw m9, m6
+ pmulhw m5, m3
+ paddw m5, m9
+ %endif
+ paddw m0, m5
+%endmacro
+
+%macro LOAD_BODY 3 ; dst, src, block_width
+ %if %3 == 4
+ PMOVZXBW m0, [%2+strideq*0]
+ PMOVZXBW m1, [%2+strideq*1]
+ PMOVZXBW m2, [%2+strideq*2]
+ PMOVZXBW m3, [%2+stride3q]
+ mova [%1+32*0], m0
+ mova [%1+32*1], m1
+ mova [%1+32*2], m2
+ mova [%1+32*3], m3
+ %else
+ movu m0, [%2+strideq*0]
+ movu m1, [%2+strideq*1]
+ movu m2, [%2+strideq*2]
+ movu m3, [%2+stride3q]
+ punpcklbw m4, m0, m7
+ punpckhbw m0, m7
+ mova [%1+32*0+ 0], m4
+ mova [%1+32*0+16], m0
+ punpcklbw m4, m1, m7
+ punpckhbw m1, m7
+ mova [%1+32*1+ 0], m4
+ mova [%1+32*1+16], m1
+ punpcklbw m4, m2, m7
+ punpckhbw m2, m7
+ mova [%1+32*2+ 0], m4
+ mova [%1+32*2+16], m2
+ punpcklbw m4, m3, m7
+ punpckhbw m3, m7
+ mova [%1+32*3+ 0], m4
+ mova [%1+32*3+16], m3
+ %endif
+%endmacro
+
+%macro CDEF_FILTER_END 2 ; w, minmax
+ pxor m6, m6
+ pcmpgtw m6, m0
+ paddw m0, m6
+ %if cpuflag(ssse3)
+ pmulhrsw m0, m15
+ %else
+ paddw m0, m15
+ psraw m0, 4
+ %endif
+ paddw m4, m0
+ %if %2
+ pminsw m4, m7
+ pmaxsw m4, m8
+ %endif
+ packuswb m4, m4
+ %if %1 == 4
+ movd [dstq+strideq*0], m4
+ psrlq m4, 32
+ movd [dstq+strideq*1], m4
+ add stkq, 32*2
+ lea dstq, [dstq+strideq*2]
+ %else
+ movq [dstq], m4
+ add stkq, 32
+ add dstq, strideq
+ %endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+ %if ARCH_X86_64
+cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \
+ dst, stride, left, top, bot, pri, dst4, edge, \
+ stride3
+ %define px rsp+3*16+2*32
+ %define base 0
+ %else
+cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
+ dst, stride, left, edge, stride3
+ %define topq r2
+ %define botq r2
+ %define dst4q r2
+ LEA r5, tap_table
+ %define px esp+7*16+2*32
+ %define base r5-tap_table
+ %endif
+ mov edged, r9m
+ %if cpuflag(sse4)
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
+ %else
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
+ %endif
+ mova m6, OUT_OF_BOUNDS_MEM
+ pxor m7, m7
+
+ ; prepare pixel buffers - body/right
+ %if %2 == 8
+ lea dst4q, [dstq+strideq*4]
+ %endif
+ lea stride3q, [strideq*3]
+ test edgeb, 2 ; have_right
+ jz .no_right
+ LOAD_BODY px, dstq, %1
+ %if %2 == 8
+ LOAD_BODY px+4*32, dst4q, %1
+ %endif
+ jmp .body_done
+.no_right:
+ PMOVZXBW m0, [dstq+strideq*0], %1 == 4
+ PMOVZXBW m1, [dstq+strideq*1], %1 == 4
+ PMOVZXBW m2, [dstq+strideq*2], %1 == 4
+ PMOVZXBW m3, [dstq+stride3q ], %1 == 4
+ mova [px+32*0], m0
+ mova [px+32*1], m1
+ mova [px+32*2], m2
+ mova [px+32*3], m3
+ movd [px+32*0+%1*2], m6
+ movd [px+32*1+%1*2], m6
+ movd [px+32*2+%1*2], m6
+ movd [px+32*3+%1*2], m6
+ %if %2 == 8
+ PMOVZXBW m0, [dst4q+strideq*0], %1 == 4
+ PMOVZXBW m1, [dst4q+strideq*1], %1 == 4
+ PMOVZXBW m2, [dst4q+strideq*2], %1 == 4
+ PMOVZXBW m3, [dst4q+stride3q ], %1 == 4
+ mova [px+32*4], m0
+ mova [px+32*5], m1
+ mova [px+32*6], m2
+ mova [px+32*7], m3
+ movd [px+32*4+%1*2], m6
+ movd [px+32*5+%1*2], m6
+ movd [px+32*6+%1*2], m6
+ movd [px+32*7+%1*2], m6
+ %endif
+.body_done:
+
+ ; top
+ movifnidn topq, r3mp
+ test edgeb, 4 ; have_top
+ jz .no_top
+ test edgeb, 1 ; have_left
+ jz .top_no_left
+ test edgeb, 2 ; have_right
+ jz .top_no_right
+ %if %1 == 4
+ PMOVZXBW m0, [topq+strideq*0-2]
+ PMOVZXBW m1, [topq+strideq*1-2]
+ %else
+ movu m0, [topq+strideq*0-4]
+ movu m1, [topq+strideq*1-4]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px-32*2+8], m2
+ movu [px-32*1+8], m3
+ %endif
+ movu [px-32*2-%1], m0
+ movu [px-32*1-%1], m1
+ jmp .top_done
+.top_no_right:
+ %if %1 == 4
+ PMOVZXBW m0, [topq+strideq*0-%1]
+ PMOVZXBW m1, [topq+strideq*1-%1]
+ movu [px-32*2-8], m0
+ movu [px-32*1-8], m1
+ %else
+ movu m0, [topq+strideq*0-%1]
+ movu m1, [topq+strideq*1-%2]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px-32*2-16], m0
+ mova [px-32*2+ 0], m2
+ mova [px-32*1-16], m1
+ mova [px-32*1+ 0], m3
+ %endif
+ movd [px-32*2+%1*2], m6
+ movd [px-32*1+%1*2], m6
+ jmp .top_done
+.top_no_left:
+ test edgeb, 2 ; have_right
+ jz .top_no_left_right
+ %if %1 == 4
+ PMOVZXBW m0, [topq+strideq*0]
+ PMOVZXBW m1, [topq+strideq*1]
+ %else
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movd [px-32*2+16], m2
+ movd [px-32*1+16], m3
+ %endif
+ movd [px-32*2- 4], m6
+ movd [px-32*1- 4], m6
+ mova [px-32*2+ 0], m0
+ mova [px-32*1+ 0], m1
+ jmp .top_done
+.top_no_left_right:
+ PMOVZXBW m0, [topq+strideq*0], %1 == 4
+ PMOVZXBW m1, [topq+strideq*1], %1 == 4
+ movd [px-32*2-4], m6
+ movd [px-32*1-4], m6
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ movd [px-32*2+%1*2], m6
+ movd [px-32*1+%1*2], m6
+ jmp .top_done
+.no_top:
+ movu [px-32*2- 4], m6
+ movu [px-32*1- 4], m6
+ %if %1 == 8
+ movq [px-32*2+12], m6
+ movq [px-32*1+12], m6
+ %endif
+.top_done:
+
+ ; left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ movifnidn leftq, leftmp
+ %if %2 == 4
+ movq m0, [leftq]
+ %else
+ movu m0, [leftq]
+ %endif
+ %if %2 == 4
+ punpcklbw m0, m7
+ %else
+ punpckhbw m1, m0, m7
+ punpcklbw m0, m7
+ movhlps m3, m1
+ movd [px+32*4-4], m1
+ movd [px+32*6-4], m3
+ psrlq m1, 32
+ psrlq m3, 32
+ movd [px+32*5-4], m1
+ movd [px+32*7-4], m3
+ %endif
+ movhlps m2, m0
+ movd [px+32*0-4], m0
+ movd [px+32*2-4], m2
+ psrlq m0, 32
+ psrlq m2, 32
+ movd [px+32*1-4], m0
+ movd [px+32*3-4], m2
+ jmp .left_done
+.no_left:
+ movd [px+32*0-4], m6
+ movd [px+32*1-4], m6
+ movd [px+32*2-4], m6
+ movd [px+32*3-4], m6
+ %if %2 == 8
+ movd [px+32*4-4], m6
+ movd [px+32*5-4], m6
+ movd [px+32*6-4], m6
+ movd [px+32*7-4], m6
+ %endif
+.left_done:
+
+ ; bottom
+ movifnidn botq, r4mp
+ test edgeb, 8 ; have_bottom
+ jz .no_bottom
+ test edgeb, 1 ; have_left
+ jz .bottom_no_left
+ test edgeb, 2 ; have_right
+ jz .bottom_no_right
+ %if %1 == 4
+ PMOVZXBW m0, [botq+strideq*0-(%1/2)]
+ PMOVZXBW m1, [botq+strideq*1-(%1/2)]
+ %else
+ movu m0, [botq+strideq*0-4]
+ movu m1, [botq+strideq*1-4]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px+32*(%2+0)+8], m2
+ movu [px+32*(%2+1)+8], m3
+ %endif
+ movu [px+32*(%2+0)-%1], m0
+ movu [px+32*(%2+1)-%1], m1
+ jmp .bottom_done
+.bottom_no_right:
+ %if %1 == 4
+ PMOVZXBW m0, [botq+strideq*0-4]
+ PMOVZXBW m1, [botq+strideq*1-4]
+ movu [px+32*(%2+0)-8], m0
+ movu [px+32*(%2+1)-8], m1
+ %else
+ movu m0, [botq+strideq*0-8]
+ movu m1, [botq+strideq*1-8]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)-16], m0
+ mova [px+32*(%2+0)+ 0], m2
+ mova [px+32*(%2+1)-16], m1
+ mova [px+32*(%2+1)+ 0], m3
+ movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
+ %endif
+ movd [px+32*(%2+0)+%1*2], m6
+ movd [px+32*(%2+1)+%1*2], m6
+ jmp .bottom_done
+.bottom_no_left:
+ test edgeb, 2 ; have_right
+ jz .bottom_no_left_right
+ %if %1 == 4
+ PMOVZXBW m0, [botq+strideq*0]
+ PMOVZXBW m1, [botq+strideq*1]
+ %else
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)+16], m2
+ mova [px+32*(%2+1)+16], m3
+ %endif
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)- 4], m6
+ movd [px+32*(%2+1)- 4], m6
+ jmp .bottom_done
+.bottom_no_left_right:
+ PMOVZXBW m0, [botq+strideq*0], %1 == 4
+ PMOVZXBW m1, [botq+strideq*1], %1 == 4
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)+%1*2], m6
+ movd [px+32*(%2+1)+%1*2], m6
+ movd [px+32*(%2+0)- 4], m6
+ movd [px+32*(%2+1)- 4], m6
+ jmp .bottom_done
+.no_bottom:
+ movu [px+32*(%2+0)- 4], m6
+ movu [px+32*(%2+1)- 4], m6
+ %if %1 == 8
+ movq [px+32*(%2+0)+12], m6
+ movq [px+32*(%2+1)+12], m6
+ %endif
+.bottom_done:
+
+ ; actual filter
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec
+ mova m13, [shufb_lohi]
+ %if cpuflag(ssse3)
+ mova m15, [pw_2048]
+ %else
+ mova m15, [pw_8]
+ %endif
+ mova m14, m6
+ %else
+ DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
+ %xdefine m8 m1
+ %xdefine m9 m2
+ %xdefine m10 m0
+ %xdefine m13 [base+shufb_lohi]
+ %xdefine m14 OUT_OF_BOUNDS_MEM
+ %if cpuflag(ssse3)
+ %xdefine m15 [base+pw_2048]
+ %else
+ %xdefine m15 [base+pw_8]
+ %endif
+ %endif
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ mov dampingd, r8m
+ movif32 [esp+0x3C], r1d
+ test prid, prid
+ jz .sec_only
+ movd m1, r5m
+ bsr pridmpd, prid
+ test secd, secd
+ jz .pri_only
+ movd m10, r6m
+ tzcnt secd, secd
+ and prid, 1
+ sub pridmpd, dampingd
+ sub secd, dampingd
+ xor dampingd, dampingd
+ add prid, prid
+ neg pridmpd
+ cmovs pridmpd, dampingd
+ neg secd
+ PSHUFB_0 m1, m7
+ PSHUFB_0 m10, m7
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec
+ lea tapq, [tap_table]
+ MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask
+ MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask
+ mov [rsp+0x00], pridmpq ; pri_shift
+ mov [rsp+0x10], secq ; sec_shift
+ DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off
+ %else
+ MOVDDUP m2, [tapq+pridmpq*8]
+ MOVDDUP m3, [tapq+secq*8]
+ mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw
+ mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP
+ mov [esp+0x00], pridmpd
+ mov [esp+0x30], secd
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+ %define offq dstq
+ %define kd strided
+ %define kq strideq
+ mova [esp+0x10], m2
+ mova [esp+0x40], m3
+ mova [esp+0x20], m1
+ mova [esp+0x50], m10
+ %endif
+ mov dird, r7m
+ lea stkq, [px]
+ lea priq, [tapq+8*8+priq*8] ; pri_taps
+ mov hd, %1*%2/8
+ lea dirq, [tapq+dirq*2]
+.v_loop:
+ movif32 [esp+0x38], dstd
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0] ; px
+ %endif
+ pxor m0, m0 ; sum
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+.k_loop:
+ MOVDDUP m2, [priq+kq*8]
+ %if ARCH_X86_64
+ ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
+ ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
+ %else
+ ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+ %endif
+ dec kd
+ jge .k_loop
+ movif32 dstq, [esp+0x38]
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 1
+ dec hd
+ jg .v_loop
+ RET
+
+.pri_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap
+ lea tapq, [tap_table]
+ %else
+ DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
+ %endif
+ and prid, 1
+ xor zerod, zerod
+ sub dampingd, pridmpd
+ cmovs dampingd, zerod
+ add prid, prid
+ PSHUFB_0 m1, m7
+ MOVDDUP m7, [tapq+dampingq*8]
+ mov [rsp+0x00], dampingq
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off
+ %else
+ mov [rsp+0x04], zerod
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+ %endif
+ mov dird, r7m
+ lea stkq, [px]
+ lea priq, [tapq+8*8+priq*8]
+ mov hd, %1*%2/8
+ lea dirq, [tapq+dirq*2]
+.pri_v_loop:
+ movif32 [esp+0x38], dstd
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0]
+ %endif
+ pxor m0, m0
+.pri_k_loop:
+ MOVDDUP m2, [priq+kq*8]
+ ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .pri_k_loop
+ movif32 dstq, [esp+0x38]
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 0
+ dec hd
+ jg .pri_v_loop
+ RET
+
+.sec_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec
+%else
+ DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
+%endif
+ movd m1, r6m
+ tzcnt secd, secd
+ mov dird, r7m
+ xor zerod, zerod
+ sub dampingd, secd
+ cmovs dampingd, zerod
+ PSHUFB_0 m1, m7
+ %if ARCH_X86_64
+ lea tapq, [tap_table]
+ %else
+ mov [rsp+0x04], zerod
+ %endif
+ mov [rsp+0x00], dampingq
+ MOVDDUP m7, [tapq+dampingq*8]
+ lea dirq, [tapq+dirq*2]
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k
+ %else
+ DEFINE_ARGS dst, stride, off, stk, dir, tap, h
+ %endif
+ lea stkq, [px]
+ mov hd, %1*%2/8
+.sec_v_loop:
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0]
+ %endif
+ pxor m0, m0
+.sec_k_loop:
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
+ %if ARCH_X86_32
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ %endif
+ ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .sec_k_loop
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 0
+ dec hd
+ jg .sec_v_loop
+ RET
+%endmacro
+
+%macro MULLD 2
+ %if cpuflag(sse4)
+ pmulld %1, %2
+ %else
+ %if ARCH_X86_32
+ %define m15 m1
+ %endif
+ pmulhuw m15, %1, %2
+ pmullw %1, %2
+ pslld m15, 16
+ paddd %1, m15
+ %endif
+%endmacro
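+; rough scalar equivalent of the pre-SSE4 path, with c stored in both 16-bit
+; halves of each div_table_ssse3 dword:
+;   x * c == (uint16_t)x * c + (((x >> 16) * c) << 16)   (mod 2^32)
+; so a pmullw/pmulhuw pair replaces the missing pmulld.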
+
+%macro CDEF_DIR 0
+ %if ARCH_X86_64
+cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var
+ lea r6, [strideq*3]
+ movq m1, [srcq+strideq*0]
+ movhps m1, [srcq+strideq*1]
+ movq m3, [srcq+strideq*2]
+ movhps m3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ movq m5, [srcq+strideq*0]
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+r6 ]
+
+ pxor m8, m8
+ psadbw m9, m1, m8
+ psadbw m2, m3, m8
+ psadbw m4, m5, m8
+ psadbw m6, m7, m8
+ packssdw m9, m2
+ packssdw m4, m6
+ packssdw m9, m4
+
+ punpcklbw m0, m1, m8
+ punpckhbw m1, m8
+ punpcklbw m2, m3, m8
+ punpckhbw m3, m8
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ punpcklbw m6, m7, m8
+ punpckhbw m7, m8
+cglobal_label .main
+ mova m8, [pw_128]
+ psubw m0, m8
+ psubw m1, m8
+ psubw m2, m8
+ psubw m3, m8
+ psubw m4, m8
+ psubw m5, m8
+ psubw m6, m8
+ psubw m7, m8
+ psllw m8, 3
+ psubw m9, m8 ; partial_sum_hv[0]
+
+ paddw m8, m0, m1
+ paddw m10, m2, m3
+ paddw m8, m4
+ paddw m10, m5
+ paddw m8, m6
+ paddw m10, m7
+ paddw m8, m10 ; partial_sum_hv[1]
+
+ pmaddwd m8, m8
+ pmaddwd m9, m9
+ phaddd m9, m8
+ SWAP m8, m9
+ MULLD m8, [div_table%+SUFFIX+48]
+
+ pslldq m9, m1, 2
+ psrldq m10, m1, 14
+ pslldq m11, m2, 4
+ psrldq m12, m2, 12
+ pslldq m13, m3, 6
+ psrldq m14, m3, 10
+ paddw m9, m0
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14 ; partial_sum_diag[0] top/right half
+ paddw m9, m11 ; partial_sum_diag[0] top/left half
+ pslldq m11, m4, 8
+ psrldq m12, m4, 8
+ pslldq m13, m5, 10
+ psrldq m14, m5, 6
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m6, 12
+ psrldq m12, m6, 4
+ pslldq m13, m7, 14
+ psrldq m14, m7, 2
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13 ; partial_sum_diag[0][0-7]
+ paddw m10, m14 ; partial_sum_diag[0][8-14,zero]
+ pshufb m10, [shufw_6543210x]
+ punpckhwd m11, m9, m10
+ punpcklwd m9, m10
+ pmaddwd m11, m11
+ pmaddwd m9, m9
+ MULLD m11, [div_table%+SUFFIX+16]
+ MULLD m9, [div_table%+SUFFIX+0]
+ paddd m9, m11 ; cost[0a-d]
+
+ pslldq m10, m0, 14
+ psrldq m11, m0, 2
+ pslldq m12, m1, 12
+ psrldq m13, m1, 4
+ pslldq m14, m2, 10
+ psrldq m15, m2, 6
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15
+ pslldq m12, m3, 8
+ psrldq m13, m3, 8
+ pslldq m14, m4, 6
+ psrldq m15, m4, 10
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15
+ pslldq m12, m5, 4
+ psrldq m13, m5, 12
+ pslldq m14, m6, 2
+ psrldq m15, m6, 14
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15 ; partial_sum_diag[1][8-14,zero]
+ paddw m10, m7 ; partial_sum_diag[1][0-7]
+ pshufb m11, [shufw_6543210x]
+ punpckhwd m12, m10, m11
+ punpcklwd m10, m11
+ pmaddwd m12, m12
+ pmaddwd m10, m10
+ MULLD m12, [div_table%+SUFFIX+16]
+ MULLD m10, [div_table%+SUFFIX+0]
+ paddd m10, m12 ; cost[4a-d]
+ phaddd m9, m10 ; cost[0a/b,4a/b]
+
+ paddw m10, m0, m1
+ paddw m11, m2, m3
+ paddw m12, m4, m5
+ paddw m13, m6, m7
+ phaddw m0, m4
+ phaddw m1, m5
+ phaddw m2, m6
+ phaddw m3, m7
+
+ ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
+ pslldq m4, m11, 2
+ psrldq m5, m11, 14
+ pslldq m6, m12, 4
+ psrldq m7, m12, 12
+ pslldq m14, m13, 6
+ psrldq m15, m13, 10
+ paddw m4, m10
+ paddw m5, m7
+ paddw m4, m6
+ paddw m5, m15 ; partial_sum_alt[3] right
+ paddw m4, m14 ; partial_sum_alt[3] left
+ pshuflw m6, m5, q3012
+ punpckhwd m5, m4
+ punpcklwd m4, m6
+ pmaddwd m5, m5
+ pmaddwd m4, m4
+ MULLD m5, [div_table%+SUFFIX+48]
+ MULLD m4, [div_table%+SUFFIX+32]
+ paddd m4, m5 ; cost[7a-d]
+
+ pslldq m5, m10, 6
+ psrldq m6, m10, 10
+ pslldq m7, m11, 4
+ psrldq m10, m11, 12
+ pslldq m11, m12, 2
+ psrldq m12, 14
+ paddw m5, m7
+ paddw m6, m10
+ paddw m5, m11
+ paddw m6, m12
+ paddw m5, m13
+ pshuflw m7, m6, q3012
+ punpckhwd m6, m5
+ punpcklwd m5, m7
+ pmaddwd m6, m6
+ pmaddwd m5, m5
+ MULLD m6, [div_table%+SUFFIX+48]
+ MULLD m5, [div_table%+SUFFIX+32]
+ paddd m5, m6 ; cost[5a-d]
+
+ pslldq m6, m1, 2
+ psrldq m7, m1, 14
+ pslldq m10, m2, 4
+ psrldq m11, m2, 12
+ pslldq m12, m3, 6
+ psrldq m13, m3, 10
+ paddw m6, m0
+ paddw m7, m11
+ paddw m6, m10
+ paddw m7, m13 ; partial_sum_alt[0] right
+ paddw m6, m12 ; partial_sum_alt[0] left
+ pshuflw m10, m7, q3012
+ punpckhwd m7, m6
+ punpcklwd m6, m10
+ pmaddwd m7, m7
+ pmaddwd m6, m6
+ MULLD m7, [div_table%+SUFFIX+48]
+ MULLD m6, [div_table%+SUFFIX+32]
+ paddd m6, m7 ; cost[1a-d]
+
+ pshufd m0, m0, q1032
+ pshufd m1, m1, q1032
+ pshufd m2, m2, q1032
+ pshufd m3, m3, q1032
+
+ pslldq m10, m0, 6
+ psrldq m11, m0, 10
+ pslldq m12, m1, 4
+ psrldq m13, m1, 12
+ pslldq m14, m2, 2
+ psrldq m2, 14
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m2
+ paddw m10, m3
+ pshuflw m12, m11, q3012
+ punpckhwd m11, m10
+ punpcklwd m10, m12
+ pmaddwd m11, m11
+ pmaddwd m10, m10
+ MULLD m11, [div_table%+SUFFIX+48]
+ MULLD m10, [div_table%+SUFFIX+32]
+ paddd m10, m11 ; cost[3a-d]
+
+ phaddd m9, m8 ; cost[0,4,2,6]
+ phaddd m6, m10
+ phaddd m5, m4
+ phaddd m6, m5 ; cost[1,3,5,7]
+ pshufd m4, m9, q3120
+
+ ; now find the best cost
+ %if cpuflag(sse4)
+ pmaxsd m9, m6
+ pshufd m0, m9, q1032
+ pmaxsd m0, m9
+ pshufd m1, m0, q2301
+ pmaxsd m0, m1 ; best cost
+ %else
+ pcmpgtd m0, m9, m6
+ pand m9, m0
+ pandn m0, m6
+ por m9, m0
+ pshufd m1, m9, q1032
+ pcmpgtd m0, m9, m1
+ pand m9, m0
+ pandn m0, m1
+ por m9, m0
+ pshufd m1, m9, q2301
+ pcmpgtd m0, m9, m1
+ pand m9, m0
+ pandn m0, m1
+ por m0, m9
+ %endif
+
+ ; get direction and variance
+ punpckhdq m1, m4, m6
+ punpckldq m4, m6
+ psubd m2, m0, m1
+ psubd m3, m0, m4
+%if WIN64
+ WIN64_RESTORE_XMM
+ %define tmp rsp+stack_offset+8
+%else
+ %define tmp rsp-40
+%endif
+ mova [tmp+0x00], m2 ; emulate ymm in stack
+ mova [tmp+0x10], m3
+ pcmpeqd m1, m0 ; compute best cost mask
+ pcmpeqd m4, m0
+ packssdw m4, m1
+ pmovmskb eax, m4 ; get byte-idx from mask
+ tzcnt eax, eax
+ mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm
+ shr eax, 1 ; get direction by converting byte-idx to word-idx
+ shr r1d, 10
+ mov [varq], r1d
+ %else
+cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
+%define base r2-shufw_6543210x
+ LEA r2, shufw_6543210x
+ pxor m0, m0
+ lea stride3q, [strideq*3]
+ movq m5, [srcq+strideq*0]
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+stride3q]
+ mova m1, [base+pw_128]
+ psadbw m2, m5, m0
+ psadbw m3, m7, m0
+ packssdw m2, m3
+ punpcklbw m4, m5, m0
+ punpckhbw m5, m0
+ punpcklbw m6, m7, m0
+ punpckhbw m7, m0
+ psubw m4, m1
+ psubw m5, m1
+ psubw m6, m1
+ psubw m7, m1
+
+ mova [esp+0x00], m4
+ mova [esp+0x10], m5
+ mova [esp+0x20], m6
+ mova [esp+0x50], m7
+
+ lea srcq, [srcq+strideq*4]
+ movq m5, [srcq+strideq*0]
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+stride3q]
+ psadbw m3, m5, m0
+ psadbw m0, m7
+ packssdw m3, m0
+ pxor m0, m0
+ punpcklbw m4, m5, m0
+ punpckhbw m5, m0
+ punpcklbw m6, m7, m0
+ punpckhbw m7, m0
+cglobal_label .main
+ psubw m4, m1
+ psubw m5, m1
+ psubw m6, m1
+ psubw m7, m1
+ packssdw m2, m3
+ psllw m1, 3
+ psubw m2, m1 ; partial_sum_hv[0]
+ pmaddwd m2, m2
+
+ mova m3, [esp+0x50]
+ mova m0, [esp+0x00]
+ paddw m0, [esp+0x10]
+ paddw m1, m3, [esp+0x20]
+ paddw m0, m4
+ paddw m1, m5
+ paddw m0, m6
+ paddw m1, m7
+ paddw m0, m1 ; partial_sum_hv[1]
+ pmaddwd m0, m0
+
+ phaddd m2, m0
+ MULLD m2, [base+div_table%+SUFFIX+48]
+ mova [esp+0x30], m2
+
+ mova m1, [esp+0x10]
+ pslldq m0, m1, 2
+ psrldq m1, 14
+ paddw m0, [esp+0x00]
+ pslldq m2, m3, 6
+ psrldq m3, 10
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x20]
+ pslldq m2, m3, 4
+ psrldq m3, 12
+ paddw m0, m2 ; partial_sum_diag[0] top/left half
+ paddw m1, m3 ; partial_sum_diag[0] top/right half
+ pslldq m2, m4, 8
+ psrldq m3, m4, 8
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m5, 10
+ psrldq m3, m5, 6
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m6, 12
+ psrldq m3, m6, 4
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m7, 14
+ psrldq m3, m7, 2
+ paddw m0, m2 ; partial_sum_diag[0][0-7]
+ paddw m1, m3 ; partial_sum_diag[0][8-14,zero]
+ mova m3, [esp+0x50]
+ pshufb m1, [base+shufw_6543210x]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
+ paddd m0, m2 ; cost[0a-d]
+ mova [esp+0x40], m0
+
+ mova m1, [esp+0x00]
+ pslldq m0, m1, 14
+ psrldq m1, 2
+ paddw m0, m7
+ pslldq m2, m3, 8
+ psrldq m3, 8
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x20]
+ pslldq m2, m3, 10
+ psrldq m3, 6
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x10]
+ pslldq m2, m3, 12
+ psrldq m3, 4
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m4, 6
+ psrldq m3, m4, 10
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m5, 4
+ psrldq m3, m5, 12
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m6, 2
+ psrldq m3, m6, 14
+ paddw m0, m2 ; partial_sum_diag[1][0-7]
+ paddw m1, m3 ; partial_sum_diag[1][8-14,zero]
+ mova m3, [esp+0x50]
+ pshufb m1, [base+shufw_6543210x]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
+ paddd m0, m2 ; cost[4a-d]
+ phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
+ phaddd m1, [esp+0x30] ; cost[0,4,2,6]
+ mova [esp+0x30], m1
+
+ phaddw m0, [esp+0x00], m4
+ phaddw m1, [esp+0x10], m5
+ paddw m4, m5
+ mova m2, [esp+0x20]
+ paddw m5, m2, m3
+ phaddw m2, m6
+ paddw m6, m7
+ phaddw m3, m7
+ mova m7, [esp+0x00]
+ paddw m7, [esp+0x10]
+ mova [esp+0x00], m0
+ mova [esp+0x10], m1
+ mova [esp+0x20], m2
+
+ pslldq m1, m4, 4
+ pslldq m2, m6, 6
+ pslldq m0, m5, 2
+ paddw m1, m2
+ paddw m0, m7
+ psrldq m2, m5, 14
+ paddw m0, m1 ; partial_sum_alt[3] left
+ psrldq m1, m4, 12
+ paddw m1, m2
+ psrldq m2, m6, 10
+ paddw m1, m2 ; partial_sum_alt[3] right
+ pshuflw m1, m1, q3012
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
+ paddd m0, m2 ; cost[7a-d]
+ mova [esp+0x40], m0
+
+ pslldq m0, m7, 6
+ psrldq m7, 10
+ pslldq m1, m5, 4
+ psrldq m5, 12
+ pslldq m2, m4, 2
+ psrldq m4, 14
+ paddw m0, m6
+ paddw m7, m5
+ paddw m0, m1
+ paddw m7, m4
+ paddw m0, m2
+ pshuflw m2, m7, q3012
+ punpckhwd m7, m0
+ punpcklwd m0, m2
+ pmaddwd m7, m7
+ pmaddwd m0, m0
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
+ paddd m0, m7 ; cost[5a-d]
+ mova [esp+0x50], m0
+
+ mova m7, [esp+0x10]
+ mova m2, [esp+0x20]
+ pslldq m0, m7, 2
+ psrldq m7, 14
+ pslldq m4, m2, 4
+ psrldq m2, 12
+ pslldq m5, m3, 6
+ psrldq m6, m3, 10
+ paddw m0, [esp+0x00]
+ paddw m7, m2
+ paddw m4, m5
+ paddw m7, m6 ; partial_sum_alt[3] right
+ paddw m0, m4 ; partial_sum_alt[3] left
+ pshuflw m2, m7, q3012
+ punpckhwd m7, m0
+ punpcklwd m0, m2
+ pmaddwd m7, m7
+ pmaddwd m0, m0
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
+ paddd m0, m7 ; cost[1a-d]
+ SWAP m0, m4
+
+ pshufd m0, [esp+0x00], q1032
+ pshufd m1, [esp+0x10], q1032
+ pshufd m2, [esp+0x20], q1032
+ pshufd m3, m3, q1032
+ mova [esp+0x00], m4
+
+ pslldq m4, m0, 6
+ psrldq m0, 10
+ pslldq m5, m1, 4
+ psrldq m1, 12
+ pslldq m6, m2, 2
+ psrldq m2, 14
+ paddw m4, m3
+ paddw m0, m1
+ paddw m5, m6
+ paddw m0, m2
+ paddw m4, m5
+ pshuflw m2, m0, q3012
+ punpckhwd m0, m4
+ punpcklwd m4, m2
+ pmaddwd m0, m0
+ pmaddwd m4, m4
+ MULLD m0, [base+div_table%+SUFFIX+48]
+ MULLD m4, [base+div_table%+SUFFIX+32]
+ paddd m4, m0 ; cost[3a-d]
+
+ mova m1, [esp+0x00]
+ mova m2, [esp+0x50]
+ mova m0, [esp+0x30] ; cost[0,4,2,6]
+ phaddd m1, m4
+ phaddd m2, [esp+0x40] ; cost[1,3,5,7]
+ phaddd m1, m2
+ pshufd m2, m0, q3120
+
+ ; now find the best cost
+ %if cpuflag(sse4)
+ pmaxsd m0, m1
+ pshufd m3, m0, q1032
+ pmaxsd m3, m0
+ pshufd m0, m3, q2301
+ pmaxsd m0, m3
+ %else
+ pcmpgtd m3, m0, m1
+ pand m0, m3
+ pandn m3, m1
+ por m0, m3
+ pshufd m4, m0, q1032
+ pcmpgtd m3, m0, m4
+ pand m0, m3
+ pandn m3, m4
+ por m0, m3
+ pshufd m4, m0, q2301
+ pcmpgtd m3, m0, m4
+ pand m0, m3
+ pandn m3, m4
+ por m0, m3
+ %endif
+
+ ; get direction and variance
+ mov vard, varm
+ punpckhdq m3, m2, m1
+ punpckldq m2, m1
+ psubd m1, m0, m3
+ psubd m4, m0, m2
+ mova [esp+0x00], m1 ; emulate ymm in stack
+ mova [esp+0x10], m4
+ pcmpeqd m3, m0 ; compute best cost mask
+ pcmpeqd m2, m0
+ packssdw m2, m3
+ pmovmskb eax, m2 ; get byte-idx from mask
+ tzcnt eax, eax
+ mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm
+ shr eax, 1 ; get direction by converting byte-idx to word-idx
+ shr r1d, 10
+ mov [vard], r1d
+ %endif
+
+ RET
+%endmacro
+
+INIT_XMM sse4
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+CDEF_DIR
+
+INIT_XMM ssse3
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+CDEF_DIR
+
+INIT_XMM sse2
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
diff --git a/third_party/dav1d/src/x86/cpu.c b/third_party/dav1d/src/x86/cpu.c
new file mode 100644
index 0000000000..764d8be8ef
--- /dev/null
+++ b/third_party/dav1d/src/x86/cpu.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+
+#include "src/x86/cpu.h"
+
+typedef struct {
+ uint32_t eax, ebx, edx, ecx;
+} CpuidRegisters;
+
+void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
+uint64_t dav1d_cpu_xgetbv(unsigned xcr);
+
+#define X(reg, mask) (((reg) & (mask)) == (mask))
+
+COLD unsigned dav1d_get_cpu_flags_x86(void) {
+ union {
+ CpuidRegisters r;
+ struct {
+ uint32_t max_leaf;
+ char vendor[12];
+ };
+ } cpu;
+ dav1d_cpu_cpuid(&cpu.r, 0, 0);
+ unsigned flags = 0;
+
+ if (cpu.max_leaf >= 1) {
+ CpuidRegisters r;
+ dav1d_cpu_cpuid(&r, 1, 0);
+ const unsigned model = ((r.eax >> 4) & 0x0f) + ((r.eax >> 12) & 0xf0);
+ const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff);
+
+ if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
+ flags |= DAV1D_X86_CPU_FLAG_SSE2;
+ if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
+ flags |= DAV1D_X86_CPU_FLAG_SSSE3;
+ if (X(r.ecx, 0x00080000)) /* SSE4.1 */
+ flags |= DAV1D_X86_CPU_FLAG_SSE41;
+ }
+ }
+#if ARCH_X86_64
+ /* We only support >128-bit SIMD on x86-64. */
+ if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
+ const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
+ if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
+ if (cpu.max_leaf >= 7) {
+ dav1d_cpu_cpuid(&r, 7, 0);
+ if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
+ flags |= DAV1D_X86_CPU_FLAG_AVX2;
+ if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ {
+ if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42))
+ flags |= DAV1D_X86_CPU_FLAG_AVX512ICL;
+ }
+ }
+ }
+ }
+ }
+#endif
+ if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) {
+ if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && (family < 0x19 ||
+ (family == 0x19 && (model < 0x10 || (model >= 0x20 && model < 0x60)))))
+ {
+ /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+ */
+ flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER;
+ }
+ }
+ }
+
+ return flags;
+}
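
For readers decoding the hexadecimal feature masks tested in cpu.c above, the following stand-alone C11 fragment (an editorial reference, not part of dav1d; the helper macro B() is illustrative only) spells out which CPUID bits each constant covers. Bit positions follow the documented layout of leaf 1, leaf 7 subleaf 0, and XCR0.

    /* Reference decode of the masks used in dav1d_get_cpu_flags_x86(). */
    #define B(n) (1u << (n))
    /* CPUID leaf 1 */
    _Static_assert(0x06008000 == (B(15)|B(25)|B(26)), "edx: CMOV|SSE|SSE2");
    _Static_assert(0x00000201 == (B(0)|B(9)),         "ecx: SSE3|SSSE3");
    _Static_assert(0x00080000 ==  B(19),               "ecx: SSE4.1");
    _Static_assert(0x18000000 == (B(27)|B(28)),       "ecx: OSXSAVE|AVX");
    /* CPUID leaf 7, subleaf 0 */
    _Static_assert(0x00000128 == (B(3)|B(5)|B(8)),    "ebx: BMI1|AVX2|BMI2");
    _Static_assert(0xd0230000 == (B(16)|B(17)|B(21)|B(28)|B(30)|B(31)),
                   "ebx: AVX512 F|DQ|IFMA|CD|BW|VL");
    _Static_assert(0x00005f42 == (B(1)|B(6)|B(8)|B(9)|B(10)|B(11)|B(12)|B(14)),
                   "ecx: VBMI|VBMI2|GFNI|VAES|VPCLMULQDQ|VNNI|BITALG|VPOPCNTDQ");
    /* XCR0 (xgetbv) */
    _Static_assert(0x00000006 == (B(1)|B(2)),         "XMM and YMM state enabled");
    _Static_assert(0x000000e0 == (B(5)|B(6)|B(7)),    "opmask and ZMM state enabled");
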
diff --git a/third_party/dav1d/src/x86/cpu.h b/third_party/dav1d/src/x86/cpu.h
new file mode 100644
index 0000000000..8529c77c9b
--- /dev/null
+++ b/third_party/dav1d/src/x86/cpu.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_CPU_H
+#define DAV1D_SRC_X86_CPU_H
+
+enum CpuFlags {
+ DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
+ DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
+ DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
+ DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
+ DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
+ * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
+ DAV1D_X86_CPU_FLAG_SLOW_GATHER = 1 << 5, /* Flag CPUs where gather instructions are slow enough
+ * to cause performance regressions. */
+};
+
+unsigned dav1d_get_cpu_flags_x86(void);
+
+#endif /* DAV1D_SRC_X86_CPU_H */
diff --git a/third_party/dav1d/src/x86/cpuid.asm b/third_party/dav1d/src/x86/cpuid.asm
new file mode 100644
index 0000000000..e1d9228660
--- /dev/null
+++ b/third_party/dav1d/src/x86/cpuid.asm
@@ -0,0 +1,55 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION .text
+
+cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
+ mov r4, regsmp
+ mov eax, leafm
+ mov ecx, subleafm
+%if ARCH_X86_64
+ mov r5, rbx
+%endif
+ cpuid
+ mov [r4+4*0], eax
+ mov [r4+4*1], ebx
+ mov [r4+4*2], edx
+ mov [r4+4*3], ecx
+%if ARCH_X86_64
+ mov rbx, r5
+%endif
+ RET
+
+cglobal cpu_xgetbv, 0, 0, 0, xcr
+ movifnidn ecx, xcrm
+ xgetbv
+%if ARCH_X86_64
+ shl rdx, 32
+ or rax, rdx
+%endif
+ RET
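
The two helpers above are thin wrappers around the CPUID and XGETBV instructions; cpu_cpuid stores the registers in eax/ebx/edx/ecx order, which is what lets cpu.c read the leaf-0 vendor string ("AuthenticAMD"/"GenuineIntel") as twelve contiguous bytes. Purely for comparison, the same queries could be written with compiler intrinsics roughly as sketched below; this assumes GCC/Clang headers (<cpuid.h>, plus -mxsave for _xgetbv) and is not how dav1d builds. The names cpuid_sketch, xgetbv_sketch and CpuidRegistersSketch are illustrative only.

    #include <stdint.h>
    #include <cpuid.h>      /* __get_cpuid_count() (GCC/Clang) */
    #include <immintrin.h>  /* _xgetbv(), needs -mxsave on GCC/Clang */

    typedef struct { uint32_t eax, ebx, edx, ecx; } CpuidRegistersSketch;

    /* Rough equivalent of dav1d_cpu_cpuid(): registers stored in the same
     * eax/ebx/edx/ecx order as cpuid.asm. */
    static void cpuid_sketch(CpuidRegistersSketch *const regs,
                             const unsigned leaf, const unsigned subleaf)
    {
        unsigned a, b, c, d;
        __get_cpuid_count(leaf, subleaf, &a, &b, &c, &d);
        regs->eax = a; regs->ebx = b; regs->edx = d; regs->ecx = c;
    }

    /* Rough equivalent of dav1d_cpu_xgetbv(): read an extended control register. */
    static uint64_t xgetbv_sketch(const unsigned xcr) {
        return _xgetbv(xcr);
    }
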
diff --git a/third_party/dav1d/src/x86/filmgrain.h b/third_party/dav1d/src/x86/filmgrain.h
new file mode 100644
index 0000000000..eeaa328d1e
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+
+#define decl_fg_fns(ext) \
+decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \
+decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext))
+
+decl_fg_fns(ssse3);
+decl_fg_fns(avx2);
+decl_fg_fns(avx512icl);
+
+static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3);
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
+
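+ /* The fgy/fguv 32x32xn kernels use vpgatherdd for the scaling LUT; on CPUs
+ * flagged DAV1D_X86_CPU_FLAG_SLOW_GATHER those gathers are slow enough that
+ * the SSSE3 kernels set above are kept instead, while grain generation
+ * (which does not gather) still gets the AVX2 versions. */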
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
+#endif
+}
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
new file mode 100644
index 0000000000..a1d4c41f27
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
@@ -0,0 +1,2248 @@
+; Copyright © 2021-2022, VideoLAN and dav1d authors
+; Copyright © 2021-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0
+gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_27_17_17_27: dw 27, 17, 17, 27
+pw_23_22: dw 23, 22, 0, 32
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+gen_ar0_shift: times 4 db 128
+ times 4 db 64
+ times 4 db 32
+ times 4 db 16
+pd_16: dd 16
+pd_m65536: dd -65536
+pb_1: times 4 db 1
+grain_max: times 2 dw 511
+ times 2 dw 2047
+grain_min: times 2 dw -512
+ times 2 dw -2048
+fg_max: times 2 dw 1023
+ times 2 dw 4095
+ times 2 dw 960
+ times 2 dw 3840
+ times 2 dw 940
+ times 2 dw 3760
+fg_min: times 2 dw 0
+ times 2 dw 64
+ times 2 dw 256
+uv_offset_mul: dd 256
+ dd 1024
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16, 8
+round_vals: dw 32, 64, 128, 256, 512, 1024
+pb_8_9_0_1: db 8, 9, 0, 1
+
+%macro JMP_TABLE 1-*
+ %xdefine %1_table %%table
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .ar%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+INIT_YMM avx2
+cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax
+%define base r4-generate_grain_y_16bpc_avx2_table
+ lea r4, [generate_grain_y_16bpc_avx2_table]
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r6d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ mov r3, -73*82*2
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ lea r7d, [bdmaxq+1]
+ movq xm4, [base+mul_bits]
+ shr r7d, 11 ; 0 for 10bpc, 2 for 12bpc
+ movq xm5, [base+hmul_bits]
+ sub r6, r7
+ mova xm6, [base+pb_mask]
+ sub bufq, r3
+ vpbroadcastw xm7, [base+round+r6*2-2]
+ lea r6, [gaussian_sequence]
+ movsxd r5, [r4+r5*4]
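+ ; Scalar reference for the loop below (the AV1 grain-synthesis LFSR, taps
+ ; 0/1/3/12 as encoded in next_upperbit_mask; the SIMD advances four
+ ; successive 16-bit states per vector and emits 8 grain words per pass):
+ ;   bit = (r ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1
+ ;   r = (r >> 1) | (bit << 15)
+ ;   buf[i] = gaussian_sequence[r >> 5], then rounded by a shift derived from
+ ;            grain_scale_shift and bitdepth (the paddw+pmulhrsw at the end)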
+.loop:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm3, xm2 ; 4 next output seeds
+ pshuflw xm0, xm3, q3333
+ psrlw xm3, 5
+ pand xm2, xm0, xm1
+ movq r7, xm3
+ psrlw xm3, xm2, 10
+ por xm2, xm3
+ pmullw xm2, xm4
+ pmulhuw xm0, xm5
+ movzx r8d, r7w
+ pshufb xm3, xm6, xm2
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm0, xm2
+ movd xm2, [r6+r8*2]
+ rorx r8, r7, 32
+ por xm3, xm0
+ shr r7d, 16
+ pinsrw xm2, [r6+r7*2], 1
+ pshuflw xm0, xm3, q3333
+ movzx r7d, r8w
+ psrlw xm3, 5
+ pinsrw xm2, [r6+r7*2], 2
+ shr r8d, 16
+ movq r7, xm3
+ pinsrw xm2, [r6+r8*2], 3
+ movzx r8d, r7w
+ pinsrw xm2, [r6+r8*2], 4
+ rorx r8, r7, 32
+ shr r7d, 16
+ pinsrw xm2, [r6+r7*2], 5
+ movzx r7d, r8w
+ pinsrw xm2, [r6+r7*2], 6
+ shr r8d, 16
+ pinsrw xm2, [r6+r8*2], 7
+ paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0
+ pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support
+ mova [bufq+r3], xm2
+ add r3, 8*2
+ jl .loop
+
+ ; auto-regression code
+ add r5, r4
+ jmp r5
+
+.ar1:
+ DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd xm4, [fg_dataq+FGData.ar_coeffs_y]
+ DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
+ pinsrb xm4, [base+pb_1], 3
+ pmovsxbw xm4, xm4
+ pshufd xm5, xm4, q1111
+ pshufd xm4, xm4, q0000
+ vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu xm0, [bufq+xq*2-82*2-2] ; top/left
+ psrldq xm2, xm0, 2 ; top
+ psrldq xm1, xm0, 4 ; top/right
+ punpcklwd xm0, xm2
+ punpcklwd xm1, xm3
+ pmaddwd xm0, xm4
+ pmaddwd xm1, xm5
+ paddd xm0, xm1
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sarx val3d, val3d, shiftd
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xb, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+.x_loop_ar1_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar1
+.ar0:
+ RET
+
+.ar2:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movq xm0, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11
+ vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4
+ vpbroadcastw xm10, [base+round_vals-12+shiftq*2]
+ pxor m1, m1
+ punpcklwd xm10, xm1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1 ; cf5-11,0-4
+ vpermq m1, m0, q3333 ; cf4
+ vbroadcasti128 m11, [base+gen_shufA]
+ pshufd m6, m0, q0000 ; cf[5,6], cf[0-1]
+ vbroadcasti128 m12, [base+gen_shufB]
+ pshufd m7, m0, q1111 ; cf[7,8], cf[2-3]
+ punpckhwd xm1, xm0
+ pshufhw xm9, xm0, q2121
+ pshufd xm8, xm1, q0000 ; cf[4,9]
+ sar bdmaxd, 1
+ punpckhqdq xm9, xm9 ; cf[10,11]
+ movd xm4, bdmaxd ; max_grain
+ pcmpeqd xm5, xm5
+ sub bufq, 2*(82*73-(82*3+79))
+ pxor xm5, xm4 ; min_grain
+ DEFINE_ARGS buf, fg_data, h, x
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+.x_loop_ar2:
+ vbroadcasti128 m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ vinserti128 m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5]
+ pshufb m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ pmaddwd m0, m6
+ punpckhwd xm2, xm1 ; y=-2/-1 interleaved, x=[+2,+5]
+ pshufb m1, m12 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, m7
+ pmaddwd xm2, xm8
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm10
+ paddd xm2, xm0
+ movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ paddd xm2, xm1
+ pmovsxwd xm1, [bufq+xq*2] ; in dwords, y=0,x=[0,3]
+.x_loop_ar2_inner:
+ pmaddwd xm3, xm9, xm0
+ psrldq xm0, 2
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; skip packssdw because we only care about one value
+ paddd xm3, xm1
+ pminsd xm3, xm4
+ psrldq xm1, 4
+ pmaxsd xm3, xm5
+ pextrw [bufq+xq*2], xm3, 0
+ punpcklwd xm3, xm3
+ pblendw xm0, xm3, 0010b
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+.x_loop_ar2_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ sar bdmaxd, 1
+ movq xm7, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6
+ movd xm0, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13
+ pinsrb xm0, [base+pb_1], 3 ; cf14-16,pb_1
+ movd xm1, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23
+ vinserti128 m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13
+ vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20
+ vpbroadcastw xm11, [base+round_vals+shiftq*2-12]
+ movd xm12, bdmaxd ; max_grain
+ punpcklbw m7, m7 ; sign-extension
+ punpcklbw m0, m0 ; sign-extension
+ punpcklbw xm1, xm1
+ REPX {psraw x, 8}, m7, m0, xm1
+ pshufd m4, m7, q0000 ; cf[0,1] | cf[7,8]
+ pshufd m5, m7, q1111 ; cf[2,3] | cf[9,10]
+ pshufd m6, m7, q2222 ; cf[4,5] | cf[11,12]
+ pshufd xm7, xm7, q3333 ; cf[6,13]
+ pshufd m8, m0, q0000 ; cf[14,15] | cf[17,18]
+ pshufd m9, m0, q1111 ; cf[16],pw_1 | cf[19,20]
+ paddw xm0, xm11, xm11
+ pcmpeqd xm13, xm13
+ pblendw xm10, xm1, xm0, 00001000b
+ pxor xm13, xm12 ; min_grain
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+.x_loop_ar3:
+ movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
+ movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8]
+ vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
+ palignr m3, m1, m0, 2 ; y=-3/-2,x=[-2,+5]
+ palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6]
+ punpckhwd m2, m0, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ pmaddwd m0, m4
+ pmaddwd m2, m6
+ pmaddwd m3, m5
+ paddd m0, m2
+ movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
+ paddd m0, m3
+ psrldq m3, m2, 2
+ punpcklwd m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ pmaddwd m3, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ paddd m0, m3
+ psrldq m3, m2, 4
+ psrldq m2, 6
+ vpblendd m2, m11, 0x0f ; rounding constant
+ punpcklwd m3, m2 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
+ pmaddwd m3, m9 ; x=[+2/+3,+3/+4,+4/+5,+5/+6]
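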
+ vextracti128 xm2, m1, 1
+ punpcklwd xm1, xm2
+ pmaddwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
+ paddd m0, m3
+ vextracti128 xm2, m0, 1
+ paddd xm0, xm1
+ movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+ paddd xm0, xm2
+.x_loop_ar3_inner:
+ pmaddwd xm2, xm1, xm10
+ pshuflw xm3, xm2, q1032
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ ; skip packssdw because we only care about one value
+ pminsd xm2, xm12
+ pmaxsd xm2, xm13
+ pextrw [bufq+xq*2], xm2, 0
+ pslldq xm2, 4
+ psrldq xm1, 2
+ pblendw xm1, xm2, 0100b
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
+INIT_XMM avx2
+cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax
+%define base r8-generate_grain_uv_%1_16bpc_avx2_table
+ lea r8, [generate_grain_uv_%1_16bpc_avx2_table]
+ movifnidn bdmaxd, bdmaxm
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ lea r6d, [bdmaxq+1]
+ movq xm4, [base+mul_bits]
+ shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc
+ movq xm5, [base+hmul_bits]
+ sub r5, r6
+ mova xm6, [base+pb_mask]
+ vpbroadcastd xm2, [base+pw_seed_xor+uvq*4]
+ vpbroadcastw xm7, [base+round+r5*2-2]
+ pxor xm0, xm2
+ lea r6, [gaussian_sequence]
+%if %2
+ mov r7d, 73-35*%3
+ add bufq, 44*2
+.loop_y:
+ mov r5, -44*2
+%else
+ mov r5, -82*73*2
+ sub bufq, r5
+%endif
+.loop_x:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm2, xm3 ; 4 next output seeds
+ pshuflw xm0, xm2, q3333
+ psrlw xm2, 5
+ movq r10, xm2
+ movzx r9d, r10w
+ movd xm2, [r6+r9*2]
+ rorx r9, r10, 32
+ shr r10d, 16
+ pinsrw xm2, [r6+r10*2], 1
+ movzx r10d, r9w
+ pinsrw xm2, [r6+r10*2], 2
+ shr r9d, 16
+ pinsrw xm2, [r6+r9*2], 3
+ paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0
+ pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support
+ movq [bufq+r5], xm2
+ add r5, 8
+ jl .loop_x
+%if %2
+ add bufq, 82*2
+ dec r7d
+ jg .loop_y
+%endif
+
+ ; auto-regression code
+ movsxd r6, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r6, [r8+r6*4]
+ add r6, r8
+ jmp r6
+
+INIT_YMM avx2
+.ar0:
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ vpbroadcastb m0, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ sar bdmaxd, 1
+ vpbroadcastd m4, [base+gen_ar0_shift-24+shiftq*4]
+ movd xm6, bdmaxd
+ pcmpeqw m7, m7
+ pmaddubsw m4, m0 ; ar_coeff << (14 - shift)
+ vpbroadcastw m6, xm6 ; max_grain
+ pxor m7, m6 ; min_grain
+ DEFINE_ARGS buf, bufy, h, x
+%if %2
+ vpbroadcastw m5, [base+hmul_bits+2+%3*2]
+ sub bufq, 2*(82*(73-35*%3)+82-(82*3+41))
+%else
+ sub bufq, 2*(82*70-3)
+%endif
+ add bufyq, 2*(3+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar0:
+%if %2
+ ; first 32 pixels
+ movu xm0, [bufyq+16*0]
+ vinserti128 m0, [bufyq+16*2], 1
+ movu xm1, [bufyq+16*1]
+ vinserti128 m1, [bufyq+16*3], 1
+%if %3
+ movu xm2, [bufyq+82*2+16*0]
+ vinserti128 m2, [bufyq+82*2+16*2], 1
+ movu xm3, [bufyq+82*2+16*1]
+ vinserti128 m3, [bufyq+82*2+16*3], 1
+ paddw m0, m2
+ paddw m1, m3
+%endif
+ phaddw m0, m1
+ movu xm1, [bufyq+16*4]
+ vinserti128 m1, [bufyq+16*6], 1
+ movu xm2, [bufyq+16*5]
+ vinserti128 m2, [bufyq+16*7], 1
+%if %3
+ movu xm3, [bufyq+82*2+16*4]
+ vinserti128 m3, [bufyq+82*2+16*6], 1
+ paddw m1, m3
+ movu xm3, [bufyq+82*2+16*5]
+ vinserti128 m3, [bufyq+82*2+16*7], 1
+ paddw m2, m3
+%endif
+ phaddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+%else
+ xor xd, xd
+.x_loop_ar0:
+ movu m0, [bufyq+xq*2]
+ movu m1, [bufyq+xq*2+32]
+%endif
+ paddw m0, m0
+ paddw m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+%if %2
+ paddw m0, [bufq+ 0]
+ paddw m1, [bufq+32]
+%else
+ paddw m0, [bufq+xq*2+ 0]
+ paddw m1, [bufq+xq*2+32]
+%endif
+ pminsw m0, m6
+ pminsw m1, m6
+ pmaxsw m0, m7
+ pmaxsw m1, m7
+%if %2
+ movu [bufq+ 0], m0
+ movu [bufq+32], m1
+
+ ; last 6 pixels
+ movu xm0, [bufyq+32*4]
+ movu xm1, [bufyq+32*4+16]
+%if %3
+ paddw xm0, [bufyq+32*4+82*2]
+ paddw xm1, [bufyq+32*4+82*2+16]
+%endif
+ phaddw xm0, xm1
+ movu xm1, [bufq+32*2]
+ pmulhrsw xm0, xm5
+ paddw xm0, xm0
+ pmulhrsw xm0, xm4
+ paddw xm0, xm1
+ pminsw xm0, xm6
+ pmaxsw xm0, xm7
+ vpblendd xm0, xm1, 0x08
+ movu [bufq+32*2], xm0
+%else
+ movu [bufq+xq*2+ 0], m0
+ movu [bufq+xq*2+32], m1
+ add xd, 32
+ cmp xd, 64
+ jl .x_loop_ar0
+
+ ; last 12 pixels
+ movu m0, [bufyq+64*2]
+ movu m1, [bufq+64*2]
+ paddw m0, m0
+ pmulhrsw m0, m4
+ paddw m0, m1
+ pminsw m0, m6
+ pmaxsw m0, m7
+ vpblendd m0, m1, 0xc0
+ movu [bufq+64*2], m0
+%endif
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar0
+ RET
+
+INIT_XMM avx2
+.ar1:
+ DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
+ DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
+ pmovsxbw xm4, xm4
+ pshufd xm5, xm4, q1111
+ pshufd xm4, xm4, q0000
+ pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
+ vpbroadcastw xm6, [base+hmul_bits+2+%3*2]
+ vpbroadcastd xm3, xm3
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu xm0, [bufq+xq*2-82*2-2] ; top/left
+%if %2
+ movu xm2, [bufyq+xq*4]
+%else
+ movq xm2, [bufyq+xq*2]
+%endif
+%if %2
+%if %3
+ phaddw xm2, [bufyq+xq*4+82*2]
+ punpckhqdq xm1, xm2, xm2
+ paddw xm2, xm1
+%else
+ phaddw xm2, xm2
+%endif
+ pmulhrsw xm2, xm6
+%endif
+ psrldq xm1, xm0, 4 ; top/right
+ punpcklwd xm1, xm2
+ psrldq xm2, xm0, 2 ; top
+ punpcklwd xm0, xm2
+ pmaddwd xm1, xm5
+ pmaddwd xm0, xm4
+ paddd xm1, xm3
+ paddd xm0, xm1
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sarx val3d, val3d, shiftd
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xb, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+.x_loop_ar1_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar1
+ RET
+
+INIT_YMM avx2
+.ar2:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign xmm_regs_used 13 + %2
+ %assign stack_size_padded 136
+ SUB rsp, stack_size_padded
+ movaps [rsp+16*2], xmm8
+ movaps [rsp+16*3], xmm9
+ movaps [rsp+16*4], xmm10
+ movaps [rsp+16*5], xmm11
+ movaps [rsp+16*6], xmm12
+%if %2
+ movaps [rsp+16*7], xmm13
+%endif
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ vbroadcasti128 m10, [base+gen_shufA]
+ sar bdmaxd, 1
+ vbroadcasti128 m11, [base+gen_shufB]
+ movd xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5]
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
+ pinsrb xm7, [base+pb_1], 5
+ pinsrw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
+ movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13
+ pmovsxbw m7, xm7
+ movd xm8, bdmaxd ; max_grain
+ pshufd m4, m7, q0000
+ vpbroadcastw xm12, [base+round_vals-12+shiftq*2]
+ pshufd m5, m7, q1111
+ pcmpeqd xm9, xm9
+ pshufd m6, m7, q2222
+ pxor xm9, xm8 ; min_grain
+ pshufd xm7, xm7, q3333
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%if %2
+ vpbroadcastw xm13, [base+hmul_bits+2+%3*2]
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+.x_loop_ar2:
+ vbroadcasti128 m3, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ vinserti128 m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5]
+ pshufb m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ pmaddwd m0, m4
+ pshufb m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, m5
+ punpckhwd m2, m3 ; y=-2/-1 interleaved, x=[+2,+5]
+%if %2
+ movu xm3, [bufyq+xq*4]
+%if %3
+ paddw xm3, [bufyq+xq*4+82*2]
+%endif
+ phaddw xm3, xm3
+ pmulhrsw xm3, xm13
+%else
+ movq xm3, [bufyq+xq*2]
+%endif
+ punpcklwd xm3, xm12 ; luma, round interleaved
+ vpblendd m2, m3, 0x0f
+ pmaddwd m2, m6
+ paddd m1, m0
+ movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ paddd m2, m1
+ vextracti128 xm1, m2, 1
+ paddd xm2, xm1
+ pshufd xm1, xm0, q3321
+ pmovsxwd xm1, xm1 ; y=0,x=[0,3] in dword
+.x_loop_ar2_inner:
+ pmaddwd xm3, xm7, xm0
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; we do not need to packssdw since we only care about one value
+ paddd xm3, xm1
+ psrldq xm1, 4
+ pminsd xm3, xm8
+ pmaxsd xm3, xm9
+ pextrw [bufq+xq*2], xm3, 0
+ psrldq xm0, 2
+ pslldq xm3, 2
+ pblendw xm0, xm3, 00000010b
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+.x_loop_ar2_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign stack_offset 32
+ %assign xmm_regs_used 14 + %2
+ %assign stack_size_padded 152
+ SUB rsp, stack_size_padded
+ movaps [rsp+16*2], xmm8
+ movaps [rsp+16*3], xmm9
+ movaps [rsp+16*4], xmm10
+ movaps [rsp+16*5], xmm11
+ movaps [rsp+16*6], xmm12
+ movaps [rsp+16*7], xmm13
+%if %2
+ movaps [rsp+16*8], xmm14
+%endif
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ vpbroadcastw xm11, [base+round_vals-12+shiftq*2]
+ sar bdmaxd, 1
+ movq xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma
+ movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
+ pmovsxbw m7, xm7
+%if %2
+ vpbroadcastw xm14, [base+hmul_bits+2+%3*2]
+%endif
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
+ pinsrb xm0, [base+pb_1], 3
+ pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
+ pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
+ pmovsxbw m0, xm0
+ movd xm12, bdmaxd ; max_grain
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pcmpeqd xm13, xm13
+ punpckhqdq xm10, xm0, xm0
+ pxor xm13, xm12 ; min_grain
+ pinsrw xm10, [base+round_vals-10+shiftq*2], 3
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+.x_loop_ar3:
+ movu xm2, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ vinserti128 m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
+ movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8]
+ vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
+ palignr m3, m1, m2, 2 ; y=-3/-2,x=[-2,+5]
+ palignr m1, m2, 12 ; y=-3/-2,x=[+3,+6]
+ punpcklwd m0, m2, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ punpckhwd m2, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ pmaddwd m0, m4
+ pmaddwd m2, m6
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m0, m3
+ movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
+%if %2
+ movu xm3, [bufyq+xq*4]
+%if %3
+ paddw xm3, [bufyq+xq*4+82*2]
+%endif
+ phaddw xm3, xm3
+ pmulhrsw xm3, xm14
+%else
+ movq xm3, [bufyq+xq*2]
+%endif
+ punpcklwd m1, m3
+ pmaddwd m1, m7
+ paddd m0, m1
+ psrldq m1, m2, 4
+ psrldq m3, m2, 6
+ vpblendd m3, m11, 0x0f ; rounding constant
+ punpcklwd m1, m3 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
+ pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ psrldq m3, m2, 2
+ punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ pmaddwd m2, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ paddd m0, m1
+ movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+ paddd m0, m2
+ vextracti128 xm2, m0, 1
+ paddd xm0, xm2
+.x_loop_ar3_inner:
+ pmaddwd xm2, xm1, xm10
+ pshuflw xm3, xm2, q1032
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ psrldq xm1, 2
+ ; no need to packssdw since we only care about one value
+ pminsd xm2, xm12
+ pmaxsd xm2, xm13
+ pextrw [bufq+xq*2], xm2, 0
+ pslldq xm2, 4
+ pblendw xm1, xm2, 00000100b
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar3
+ RET
+%endmacro
+
+cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, unused, sby, see
+%define base r11-grain_min
+ lea r11, [grain_min]
+ mov r6d, r9m ; bdmax
+ mov r9d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov sbyd, sbym
+ vpbroadcastd m8, r9m
+ shr r6d, 11 ; is_12bpc
+ vpbroadcastd m9, [base+grain_min+r6*4]
+ shlx r10d, r9d, r6d
+ vpbroadcastd m10, [base+grain_max+r6*4]
+ lea r9d, [r6+r9*4]
+ vpbroadcastw m11, [base+mul_bits+r7*2-12]
+ vpbroadcastd m12, [base+fg_min+r10*4]
+ vpbroadcastd m13, [base+fg_max+r9*4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m14, [base+pd_16]
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz .vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak
+
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m4, m2, 0x55
+ psrld m2, m1, 16
+ mova m9, m6
+ pand m2, m8
+ vpgatherdd m5, [scalingq+m2-2], m6
+ pblendw m5, m3, 0x55
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
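+ ; (the gathered scaling bytes are multiplied by mul_bits = 1<<(14-scaling_shift)
+ ; via pmaddubsw, whose zero high byte also discards the neighbouring LUT byte
+ ; picked up by the dword gather, then doubled; pmulhrsw's rounding shift by 15
+ ; thus yields round2(scaling[src]*grain, scaling_shift))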
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m4, [grain_lutq+offxyq*2]
+ pmulhrsw m5, [grain_lutq+offxyq*2+32]
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je .loop_x
+ movq xm7, [pw_27_17_17_27]
+ cmp dword r8m, 0 ; sby
+ jne .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy
+
+ lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m4, m2, 0x55
+ psrld m2, m1, 16
+ mova m9, m6
+ pand m2, m8
+ vpgatherdd m5, [scalingq+m2-2], m6
+ pblendw m5, m3, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
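+ ; blend the first 2 columns with the previous block's grain:
+ ; col0 = round2(left*27 + cur*17, 5), col1 = round2(left*17 + cur*27, 5),
+ ; clamped to [grain_min, grain_max] before scaling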
+ movu m3, [grain_lutq+offxyq*2]
+ movd xm6, [grain_lutq+left_offxyq*2]
+ punpcklwd xm6, xm3
+ pmaddwd xm6, xm7
+ paddd xm6, xm14
+ psrad xm6, 5
+ packssdw xm6, xm6
+ pmaxsw xm6, xm9
+ pminsw xm6, xm10
+ vpblendd m3, m6, 0x01
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m4, m3
+ pmulhrsw m5, [grain_lutq+offxyq*2+32]
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp dword r8m, 0 ; sby
+ jne .loop_x_hv_overlap
+ jmp .loop_x_h_overlap
+
+.vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see, src_bak
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+.loop_x_v_overlap:
+ vpbroadcastd m15, [pw_27_17_17_27]
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, unused, top_offxy
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, unused, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_v_overlap:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m9, m6
+ pand m4, m8
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
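+ ; blend this row with the grain of the block above:
+ ; row0 = round2(top*27 + cur*17, 5); the reload of m15 from pw_27_17_17_27+4
+ ; below swaps the weights to 17/27 for the second overlap row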
+ movu m6, [grain_lutq+offxyq*2]
+ movu m5, [grain_lutq+top_offxyq*2]
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ pmaddwd m4, m15
+ pmaddwd m5, m15
+ movu m7, [grain_lutq+offxyq*2+32]
+ movu m6, [grain_lutq+top_offxyq*2+32]
+ paddd m4, m14
+ paddd m5, m14
+ psrad m4, 5
+ psrad m5, 5
+ packssdw m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ paddd m5, m14
+ paddd m6, m14
+ psrad m5, 5
+ psrad m6, 5
+ packssdw m5, m6
+ pmaxsw m4, m9
+ pmaxsw m5, m9
+ pminsw m4, m10
+ pminsw m5, m10
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m11
+ pmaddubsw m3, m11
+ paddw m2, m2
+ paddw m3, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hb
+ jz .end_y_v_overlap
+ vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ add hd, 0x80000000
+ jnc .loop_y_v_overlap
+ jmp .loop_y
+.end_y_v_overlap:
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+
+.loop_x_hv_overlap:
+ vpbroadcastd m15, [pw_27_17_17_27]
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+32]
+ lea left_offxyd, [offyq+32]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_hv_overlap:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m9, m6
+ pand m4, m8
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m7, [grain_lutq+offxyq*2]
+ movd xm6, [grain_lutq+left_offxyq*2]
+ movu m5, [grain_lutq+top_offxyq*2]
+ movd xm4, [grain_lutq+topleft_offxyq*2]
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklwd xm6, xm7
+ punpcklwd xm4, xm5
+ punpcklqdq xm6, xm4
+ movddup xm4, [pw_27_17_17_27]
+ pmaddwd xm6, xm4
+ paddd xm6, xm14
+ psrad xm6, 5
+ packssdw xm6, xm6
+ pmaxsw xm6, xm9
+ pminsw xm6, xm10
+ pshuflw xm4, xm6, q1032
+ vpblendd m6, m7, 0xfe
+ vpblendd m4, m5, 0xfe
+ ; followed by v interpolation (top | cur -> cur)
+ punpckhwd m5, m7
+ pmaddwd m5, m15
+ punpcklwd m4, m6
+ pmaddwd m4, m15
+ movu m7, [grain_lutq+offxyq*2+32]
+ movu m6, [grain_lutq+top_offxyq*2+32]
+ paddd m5, m14
+ paddd m4, m14
+ psrad m5, 5
+ psrad m4, 5
+ packssdw m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ paddd m5, m14
+ paddd m6, m14
+ psrad m5, 5
+ psrad m6, 5
+ packssdw m5, m6
+ pmaxsw m4, m9
+ pmaxsw m5, m9
+ pminsw m4, m10
+ pminsw m5, m10
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m11
+ pmaddubsw m3, m11
+ paddw m2, m2
+ paddw m3, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hb
+ jz .end_y_hv_overlap
+ vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ add hd, 0x80000000
+ jnc .loop_y_hv_overlap
+ movq xm7, [pw_27_17_17_27]
+ jmp .loop_y_h_overlap
+.end_y_hv_overlap:
+ add wq, 32
+ lea srcq, [src_bakq+wq*2]
+ jl .loop_x_hv_overlap
+.end:
+ RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r12-grain_min
+ lea r12, [grain_min]
+ mov r9d, r13m ; bdmax
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov r11d, is_idm
+ mov sbyd, sbym
+ vpbroadcastw m11, [base+mul_bits+r7*2-12]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ shr r9d, 11 ; is_12bpc
+ vpbroadcastd m8, [base+grain_min+r9*4]
+ shlx r10d, r6d, r9d
+ vpbroadcastd m9, [base+grain_max+r9*4]
+ vpbroadcastw m10, r13m
+ shlx r6d, r6d, r11d
+ vpbroadcastd m12, [base+fg_min+r10*4]
+ lea r6d, [r9+r6*2]
+ vpbroadcastd m13, [base+fg_max+r6*4]
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused, sby, see, overlap
+
+%if %1
+ mov r6d, r11m
+ vpbroadcastd m0, [base+pb_8_9_0_1]
+ vpbroadcastd m1, [base+uv_offset_mul+r9*4]
+ vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4]
+ vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4]
+ pshufb m14, m0 ; { uv_luma_mult, uv_mult }
+ pmaddwd m15, m1
+%else
+%if %2
+ vpbroadcastq m15, [base+pw_23_22]
+%else
+ vpbroadcastq m15, [base+pw_27_17_17_27]
+%endif
+ vpbroadcastd m14, [base+pd_16]
+%endif
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz %%vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused2, unused3, see, unused4, unused5, unused6, luma, lstride
+
+ mov lumaq, r9mp
+ mov lstrideq, r10mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r9mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, unused2, unused3, luma, lstride
+
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, unused2, unused3, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq+ 0]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10
+ pminuw m3, m10 ; clip_pixel()
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m4, [grain_lutq+offxyq*2]
+%if %2
+ pmulhrsw m5, [grain_lutq+offxyq*2+82*2]
+%else
+ pmulhrsw m5, [grain_lutq+offxyq*2+32]
+%endif
+
+ ; dst = clip_pixel(src, noise)
+%if %1
+ paddw m0, m4
+ paddw m1, m5
+%else
+ paddw m0, m4, [srcq]
+%if %2
+ paddw m1, m5, [srcq+strideq]
+%else
+ paddw m1, m5, [srcq+32]
+%endif
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ mova [dstq+32], m1
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ sub hb, 2
+%else
+ dec hb
+%endif
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, luma, lstride
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10 ; clip_pixel()
+ pminuw m3, m10
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m2, [grain_lutq+offxyq*2]
+%if %2
+ movu m3, [grain_lutq+offxyq*2+82*2]
+%else
+ movu m3, [grain_lutq+offxyq*2+32]
+%endif
+ movd xm6, [grain_lutq+left_offxyq*2]
+%if %2
+ pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1}
+ punpckldq xm7, xm2, xm3 ; {cur0, cur1}
+ punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1}
+%else
+ punpcklwd xm6, xm2
+%endif
+%if %1
+%if %2
+ vpbroadcastq xm7, [pw_23_22]
+%else
+ movq xm7, [pw_27_17_17_27]
+%endif
+ pmaddwd xm6, xm7
+ vpbroadcastd xm7, [pd_16]
+ paddd xm6, xm7
+%else
+ pmaddwd xm6, xm15
+ paddd xm6, xm14
+%endif
+ psrad xm6, 5
+ packssdw xm6, xm6
+ pmaxsw xm6, xm8
+ pminsw xm6, xm9
+ vpblendd m2, m6, 0x01
+%if %2
+ pshuflw xm6, xm6, q1032
+ vpblendd m3, m6, 0x01
+%endif
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+%if %1
+ paddw m0, m2
+ paddw m1, m3
+%else
+ paddw m0, m2, [srcq]
+%if %2
+ paddw m1, m3, [srcq+strideq]
+%else
+ paddw m1, m3, [srcq+32]
+%endif
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ mova [dstq+32], m1
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, r10mp
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ sub hb, 2
+%else
+ dec hb
+%endif
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp dword r8m, 0 ; sby
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, unused1, unused2, unused3, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov lumaq, r9mp
+ mov lstrideq, r10mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r9mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+%%loop_x_v_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if %2 == 0
+ lea r10, [pw_27_17_17_27]
+%endif
+%%loop_y_v_overlap:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10 ; clip_pixel()
+ pminuw m3, m10
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq*2]
+ movu m3, [grain_lutq+top_offxyq*2]
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6 ; { top, cur }
+%if %3
+ vpbroadcastd m0, [pw_23_22]
+%elif %2
+ vpbroadcastd m0, [pw_27_17_17_27]
+%else
+ vpbroadcastd m0, [r10]
+%endif
+ REPX {pmaddwd x, m0}, m2, m3
+%if %1
+ vpbroadcastd m1, [pd_16]
+ REPX {paddd x, m1}, m2, m3
+%else
+ REPX {paddd x, m14}, m2, m3
+%endif
+ REPX {psrad x, 5}, m2, m3
+ packssdw m2, m3
+%if %2
+ movu m3, [grain_lutq+offxyq*2+82*2]
+%else
+ movu m3, [grain_lutq+offxyq*2+32]
+%endif
+%if %3
+ pmaxsw m2, m8
+ pminsw m2, m9
+%else
+%if %2
+ movu m7, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m6, m3, m7 ; { cur, top }
+ punpcklwd m3, m7
+%else
+ movu m7, [grain_lutq+top_offxyq*2+32]
+ punpckhwd m6, m7, m3
+ punpcklwd m3, m7, m3 ; { top, cur }
+%endif
+ pmaddwd m6, m0
+ pmaddwd m3, m0
+%if %1
+ paddd m6, m1
+ paddd m3, m1
+%else
+ paddd m6, m14
+ paddd m3, m14
+%endif
+ psrad m6, 5
+ psrad m3, 5
+ packssdw m3, m6
+ pmaxsw m2, m8
+ pmaxsw m3, m8
+ pminsw m2, m9
+ pminsw m3, m9
+%endif
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2, [srcq]
+%if %2
+ paddw m1, m3, [srcq+strideq]
+%else
+ paddw m1, m3, [srcq+32]
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ sub hb, 2
+%else
+ mova [dstq+32], m1
+ dec hb
+%endif
+ jle %%end_y_v_overlap
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ jmp %%loop_y
+%else
+ add hd, 0x80000000
+ jc %%loop_y
+ add r10, 4
+ jmp %%loop_y_v_overlap
+%endif
+%%end_y_v_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+%%loop_x_hv_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+%if %2 == 0
+ lea r14, [pw_27_17_17_27]
+%endif
+ lea topleft_offxyq, [top_offxyq+(32>>%2)]
+ lea left_offxyq, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%%loop_y_hv_overlap:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10 ; clip_pixel()
+ pminuw m3, m10
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m0, [grain_lutq+offxyq*2]
+ movd xm2, [grain_lutq+left_offxyq*2]
+ movu m6, [grain_lutq+top_offxyq*2]
+%if %2
+ pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2
+ movu m3, [grain_lutq+offxyq*2+82*2]
+ punpckldq xm1, xm0, xm3 ; { cur0, cur1 }
+%if %3
+ vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left }
+ vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 }
+%else
+ vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1
+ vpbroadcastd m7, [grain_lutq+topleft_offxyq*2]
+ vpblendd m2, m7, 0x20
+ movd xm7, [grain_lutq+top_offxyq*2+82*2]
+ punpckldq xm7, xm6
+ vinserti128 m1, xm7, 1
+ movu m7, [grain_lutq+top_offxyq*2+82*2]
+%endif
+ punpcklwd m2, m1 ; { cur, left }
+%if %1
+ vpbroadcastq m1, [pw_23_22]
+ pmaddwd m2, m1
+ vpbroadcastd m1, [pd_16]
+ paddd m2, m1
+ psrad m2, 5
+ packssdw m2, m2
+ vpermq m2, m2, q3120
+%else
+ pmaddwd m2, m15
+ paddd m2, m14
+ psrad m2, 5
+ vextracti128 xm1, m2, 1
+ packssdw xm2, xm1
+%endif
+%else
+ pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1
+ movu m3, [grain_lutq+offxyq*2+32]
+ movu m7, [grain_lutq+top_offxyq*2+32]
+ punpckldq xm1, xm0, xm6
+ punpcklwd xm2, xm1 ; { cur, left }
+%if %1
+ movddup xm1, [pw_27_17_17_27]
+ pmaddwd xm2, xm1
+ vpbroadcastd m1, [pd_16]
+ paddd xm2, xm1
+%else
+ pmaddwd xm2, xm15
+ paddd xm2, xm14
+%endif
+ psrad xm2, 5
+ packssdw xm2, xm2
+%endif
+ pmaxsw xm2, xm8
+ pminsw xm2, xm9
+ vpblendd m0, m2, 0x01
+%if %2
+ pshufd xm2, xm2, q0321
+ vpblendd m3, m2, 0x01
+%if %3 == 0
+ pshufd xm2, xm2, q0321
+ vpblendd m7, m2, 0x01
+%endif
+%endif
+ pshuflw xm2, xm2, q1032
+ vpblendd m2, m6, 0xfe
+ punpckhwd m6, m0 ; { top, cur }
+ punpcklwd m2, m0
+%if %3
+ vpbroadcastd m0, [pw_23_22]
+%elif %2
+ vpbroadcastd m0, [pw_27_17_17_27]
+%else
+ vpbroadcastd m0, [r14]
+%endif
+ pmaddwd m6, m0
+ pmaddwd m2, m0
+%if %1
+ paddd m6, m1
+ paddd m2, m1
+%else
+ paddd m6, m14
+ paddd m2, m14
+%endif
+ psrad m6, 5
+ psrad m2, 5
+ packssdw m2, m6
+
+%if %3
+ pmaxsw m2, m8
+ pminsw m2, m9
+%else
+%if %2
+ punpckhwd m6, m3, m7
+ punpcklwd m3, m7 ; { cur, top }
+%else
+ punpckhwd m6, m7, m3
+ punpcklwd m3, m7, m3 ; { top, cur }
+%endif
+ REPX {pmaddwd x, m0}, m6, m3
+%if %1
+ REPX {paddd x, m1}, m6, m3
+%else
+ REPX {paddd x, m14}, m6, m3
+%endif
+ REPX {psrad x, 5}, m6, m3
+ packssdw m3, m6
+ pmaxsw m2, m8
+ pmaxsw m3, m8
+ pminsw m2, m9
+ pminsw m3, m9
+%endif
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2, [srcq]
+%if %2
+ paddw m1, m3, [srcq+strideq]
+%else
+ paddw m1, m3, [srcq+32]
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ mova [dstq+32], m1
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, r10mp
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ sub hb, 2
+ jg %%loop_y_h_overlap
+%else
+ dec hb
+ jle %%end_y_hv_overlap
+ add hd, 0x80000000
+ jc %%loop_y_h_overlap
+ add r14, 4
+ jmp %%loop_y_hv_overlap
+%endif
+%%end_y_hv_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ jmp %%loop_x_hv_overlap
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+GEN_GRAIN_UV_FN 420, 1, 1
+FGUV_FN 420, 1, 1
+GEN_GRAIN_UV_FN 422, 1, 0
+FGUV_FN 422, 1, 0
+GEN_GRAIN_UV_FN 444, 0, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx512.asm b/third_party/dav1d/src/x86/filmgrain16_avx512.asm
new file mode 100644
index 0000000000..00dd6af599
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_avx512.asm
@@ -0,0 +1,932 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
+ db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
+scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1
+scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4
+pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27
+pw_23_22: dw 92, 88, 0, 128, 23, 22, 0, 32
+fg_min: times 2 dw 0
+ times 2 dw 64
+ times 2 dw 256
+fg_max: times 2 dw 1023
+ times 2 dw 4095
+ times 2 dw 960
+ times 2 dw 3840
+ times 2 dw 940
+ times 2 dw 3760
+scale_rnd: dd 64
+ dd 16
+uv_offset_mul: dd 256
+ dd 1024
+pb_8_9_0_1: db 8, 9, 0, 1
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, offx, sby, see, offy, src_bak
+%define base r11-fg_min
+ lea r11, [fg_min]
+ mov r6d, r9m ; bdmax
+ mov r9d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov sbyd, sbym
+ vpbroadcastd m6, r9m
+ shr r6d, 11 ; is_12bpc
+ vbroadcasti32x4 m7, [base+scale_mask]
+ shlx r10d, r9d, r6d
+ vpbroadcastd m10, [base+scale_shift+r7*4-32]
+ lea r9d, [r6+r9*4]
+ vpbroadcastd m8, [base+fg_min+r10*4]
+ kxnorw k1, k1, k1 ; 0xffff
+ vpbroadcastd m9, [base+fg_max+r9*4]
+ mov r12, 0xeeeeeeeeeeeeeeee
+ vpbroadcastd m19, [base+scale_rnd+r6*4]
+ kshiftrb k2, k1, 4 ; 0xf
+ vpbroadcastq xm20, [base+pw_27_17_17_27+r6*8]
+ kmovq k3, r12
+ vpbroadcastd m11, [base+scale_shift+r6*8+4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m12, [base+pw_27_17_17_27+r6*8+0]
+ vpbroadcastd m13, [base+pw_27_17_17_27+r6*8+4]
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz .v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
+ movu m4, [grain_lutq+offxyq*2+82*0]
+ movu m5, [grain_lutq+offxyq*2+82*2]
+ call .add_noise
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je .loop_x
+ test sbyd, sbyd
+ jnz .hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, left_offxy
+
+ lea left_offxyd, [offyq+73] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ movu m4, [grain_lutq+offxyq*2+82*0]
+ movu m5, [grain_lutq+offxyq*2+82*2]
+ movd xm17, [grain_lutq+left_offxyq*2-82*1]
+ pinsrd xm17, [grain_lutq+left_offxyq*2+82*1], 1
+ punpckldq xm16, xm4, xm5
+ punpcklwd xm17, xm16
+ mova xm16, xm19
+ vpdpwssd xm16, xm20, xm17
+ psrad xm16, 1
+ packssdw xm16, xm16
+ vpsravw xm16, xm11
+ vmovdqu8 m4{k2}, m16
+ vpalignr m5{k2}, m16, m16, 4
+ call .add_noise
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ test sbyd, sbyd
+ jnz .hv_overlap
+ jmp .loop_x_h_overlap
+
+.v_overlap:
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, _, top_offxy
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, _, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ movu m16, [grain_lutq+offxyq*2+82*0]
+ movu m0, [grain_lutq+top_offxyq*2+82*0]
+ movu m17, [grain_lutq+offxyq*2+82*2]
+ movu m1, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m4, m0, m16
+ punpcklwd m0, m16
+ punpckhwd m5, m1, m17
+ punpcklwd m1, m17
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to .v_overlap, and instead always fall-through to .hv_overlap
+.hv_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+73]
+ lea left_offxyd, [offyq+73]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ movu m5, [grain_lutq+offxyq*2+82*0]
+ movu m0, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+left_offxyq*2-82*1]
+ pinsrd xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m1, [grain_lutq+top_offxyq*2+82*2]
+ movd xm18, [grain_lutq+left_offxyq*2+82*1]
+ pinsrd xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
+ punpckldq xm16, xm5, xm0
+ punpcklwd xm17, xm16
+ mova xm16, xm19
+ vpdpwssd xm16, xm20, xm17
+ punpckldq xm17, xm2, xm1
+ punpcklwd xm18, xm17
+ mova xm17, xm19
+ vpdpwssd xm17, xm20, xm18
+ punpckhwd m4, m0, m5
+ punpcklwd m0, m5
+ punpckhwd m5, m1, m2
+ punpcklwd m1, m2
+ psrad xm16, 1
+ psrad xm17, 1
+ packssdw xm16, xm17
+ vpsravw xm16, xm11
+ vpshuflw m0{k2}, m16, q1302
+ punpckhqdq xm16, xm16
+ vpshuflw m1{k2}, m16, q1302
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ lea srcq, [src_bakq+wq*2]
+ jl .hv_overlap
+.end:
+ RET
+ALIGN function_align
+.add_noise_v:
+ mova m2, m19
+ vpdpwssd m2, m12, m4
+ mova m3, m19
+ vpdpwssd m3, m13, m5
+ mova m4, m19
+ vpdpwssd m4, m12, m0
+ mova m5, m19
+ vpdpwssd m5, m13, m1
+ REPX {psrad x, 1}, m2, m3, m4, m5
+ packssdw m4, m2
+ packssdw m5, m3
+ vpsravw m4, m11
+ vpsravw m5, m11
+.add_noise:
+ mova m0, [srcq+strideq*0]
+ mova m1, [srcq+strideq*1]
+ kmovw k4, k1
+ pand m16, m6, m0
+ psrld m3, m0, 16
+ vpgatherdd m2{k4}, [scalingq+m16]
+ vpcmpud k4, m3, m6, 2 ; px <= bdmax
+ vpgatherdd m16{k4}, [scalingq+m3]
+ kmovw k4, k1
+ pand m17, m6, m1
+ vpgatherdd m3{k4}, [scalingq+m17]
+ vpshufb m2{k3}, m16, m7
+ psrld m16, m1, 16
+ vpcmpud k4, m16, m6, 2
+ vpgatherdd m17{k4}, [scalingq+m16]
+ vpshufb m3{k3}, m17, m7
+ vpsllvw m2, m10
+ vpsllvw m3, m10
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+ add grain_lutq, 82*4
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m8
+ pmaxsw m1, m8
+ pminsw m0, m9
+ pminsw m1, m9
+ mova [dstq+srcq], m0
+ add srcq, strideq
+ mova [dstq+srcq], m1
+ add srcq, strideq
+ ret
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r12-fg_min
+ lea r12, [fg_min]
+ mov r9d, r13m ; bdmax
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r11d, is_idm
+ kxnorw k1, k1, k1 ; 0xffff
+ vpbroadcastd m5, r13m
+ mov r13, 0xeeeeeeeeeeeeeeee
+ vbroadcasti32x4 m6, [base+scale_mask]
+ shr r9d, 11 ; is_12bpc
+ vpbroadcastd m7, [base+scale_shift+r7*4-32]
+ shlx r10d, r6d, r9d
+ mov sbyd, sbym
+ shlx r6d, r6d, r11d
+ vpbroadcastd m8, [base+fg_min+r10*4]
+ lea r6d, [r9+r6*2]
+ vpbroadcastd m9, [base+fg_max+r6*4]
+ kmovq k2, r13
+ vpbroadcastd m20, [base+scale_rnd+r9*4]
+ packssdw m4, m5, m5
+ vpbroadcastd m21, [base+scale_shift+r9*8+4]
+%if %2
+ mova m12, [base+pb_0to63] ; pw_even
+ mov r13d, 0x0101
+ vpbroadcastq m10, [base+pw_23_22+r9*8]
+ kmovw k3, r13d
+%if %3
+ pshufd m11, m10, q0000
+%else
+ vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0]
+ vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4]
+ vmovdqu16 m11{k1}, m16
+%endif
+ psrlw m13, m12, 8 ; pw_odd
+%else
+ vpbroadcastq m10, [base+pw_27_17_17_27+r9*8]
+ kshiftrb k3, k1, 7 ; 0x01
+ kshiftrb k4, k1, 4 ; 0x0f
+ pshufd m11, m10, q0000
+%endif
+ mov lstrideq, r10mp
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ _, sby, see, lstride
+
+%if %1
+ mov r6d, r11m
+ vpbroadcastd m0, [base+uv_offset_mul+r9*4]
+ vpbroadcastd m1, [base+pb_8_9_0_1]
+ vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4]
+ vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
+ pmaddwd m14, m0
+ pshufb m15, m1 ; { uv_luma_mult, uv_mult }
+%endif
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz %%v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq*2]
+ lea r13, [dstq+wq*2]
+ lea r14, [lumaq+wq*(2<<%2)]
+ mov r9mp, r12
+ mov r10mp, r13
+ mov r11mp, r14
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+%if %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+offxyq*2+82*2]
+%endif
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, left_offxy
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+%if %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
+ movd xm17, [grain_lutq+left_offxyq*2+82*4]
+ vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
+ punpckldq m16, m17
+ punpckldq m17, m18, m19
+ punpcklwd m16, m17
+ mova m17, m20
+ vpdpwssd m17, m16, m10
+ psrad m17, 1
+ packssdw m17, m17
+ vpsravw m17, m21
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+offxyq*2+82*2]
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ pinsrd xm16, [grain_lutq+left_offxyq*2+82*2], 1
+ punpckldq xm17, xm18, xm19
+ punpcklwd xm16, xm17
+ mova xm17, xm20
+ vpdpwssd xm17, xm16, xm10
+ psrad xm17, 1
+ packssdw xm17, xm17
+ vpsravw xm17, xm21
+%endif
+ vmovdqa32 m18{k3}, m17
+ vpshufd m19{k3}, m17, q0321
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ _, sby, see, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, _, top_offxy
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq*2]
+ lea r13, [dstq+wq*2]
+ lea r14, [lumaq+wq*(2<<%2)]
+ mov r9mp, r12
+ mov r10mp, r13
+ mov r11mp, r14
+ neg wq
+
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, _, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %3
+ movu ym16, [grain_lutq+offxyq*2+82*0]
+ movu ym1, [grain_lutq+top_offxyq*2+82*0]
+ vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpcklwd ym17, ym1, ym16
+ punpckhwd ym1, ym16
+%elif %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym17, [grain_lutq+top_offxyq*2+82*0]
+ vinserti32x8 m17, [grain_lutq+top_offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpcklwd m16, m17, m18
+ punpckhwd m17, m18
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+top_offxyq*2+82*0]
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m16, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m1, m19, m18
+ punpcklwd m19, m18
+ punpckhwd m18, m2, m16
+ punpcklwd m2, m16
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to %%v_overlap, and instead always fall-through to %%hv_overlap
+%%hv_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+(32>>%2)]
+ lea left_offxyq, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
+ movd xm17, [grain_lutq+left_offxyq*2+82*4]
+ vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpckldq m16, m17
+ punpckldq m17, m18, m19
+ punpcklwd m16, m17
+ movu ym1, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+topleft_offxyq*2+82*0]
+ mova m0, m20
+ vpdpwssd m0, m16, m10
+%if %3
+ punpcklwd xm17, xm1
+ mova xm16, xm20
+ vpdpwssd xm16, xm17, xm10
+ psrad xm16, 1
+%else
+ vinserti32x8 m1, [grain_lutq+top_offxyq*2+82*2], 1
+ vinserti32x4 m17, [grain_lutq+topleft_offxyq*2+82*2], 2
+ punpcklwd m17, m1
+ mova m16, m20
+ vpdpwssd m16, m17, m10
+ psrad m16, 1
+%endif
+ psrad m0, 1
+ packssdw m0, m16
+ vpsravw m0, m21
+ vmovdqa32 m18{k3}, m0
+ vpshufd m19{k3}, m0, q0321
+%if %3
+ vpunpckhdq ym1{k3}, ym0, ym0
+ punpcklwd ym17, ym1, ym18
+ punpckhwd ym1, ym18
+%else
+ vpunpckhdq m1{k3}, m0, m0
+ punpcklwd m16, m1, m18
+ punpckhwd m17, m1, m18
+%endif
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+left_offxyq*2+82*0]
+ pinsrd xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
+ punpckldq xm16, xm18, xm19
+ punpcklwd xm17, xm16
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m0, [grain_lutq+top_offxyq*2+82*2]
+ movd xm16, [grain_lutq+left_offxyq*2+82*2]
+ pinsrd xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
+ punpckldq xm1, xm2, xm0
+ punpcklwd xm1, xm16, xm1
+ mova xm16, xm20
+ vpdpwssd xm16, xm17, xm10
+ mova xm17, xm20
+ vpdpwssd xm17, xm1, xm10
+ punpckhwd m1, m19, m18
+ punpcklwd m19, m18
+ punpckhwd m18, m2, m0
+ punpcklwd m2, m0
+ psrad xm16, 1
+ psrad xm17, 1
+ packssdw xm16, xm17
+ vpsravw xm16, xm21
+ vpshuflw m19{k4}, m16, q1302
+ punpckhqdq xm16, xm16
+ vpshuflw m2{k4}, m16, q3120
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ jmp %%hv_overlap
+
+ALIGN function_align
+%%add_noise_v:
+%if %3
+ mova ym16, ym20
+ vpdpwssd ym16, ym17, ym11
+ mova ym17, ym20
+ vpdpwssd ym17, ym1, ym11
+ psrad ym16, 1
+ psrad ym17, 1
+ packssdw ym16, ym17
+ vpsravw m18{k1}, m16, m21
+%elif %2
+ mova m18, m20
+ vpdpwssd m18, m16, m11
+ mova m16, m20
+ vpdpwssd m16, m17, m11
+ psrad m18, 1
+ psrad m16, 1
+ packssdw m18, m16
+ vpsravw m18, m21
+%else
+ mova m16, m20
+ vpdpwssd m16, m1, m11
+ mova m17, m20
+ vpdpwssd m17, m18, m11
+ mova m18, m20
+ vpdpwssd m18, m19, m11
+ mova m19, m20
+ vpdpwssd m19, m2, m11
+ REPX {psrad x, 1}, m16, m17, m18, m19
+ packssdw m18, m16
+ packssdw m19, m17
+ vpsravw m18, m21
+ vpsravw m19, m21
+%endif
+%%add_noise:
+%if %2
+ mova m2, [lumaq+lstrideq*(0<<%3)]
+ mova m0, [lumaq+lstrideq*(1<<%3)]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova m3, [lumaq+lstrideq*(0<<%3)]
+ mova m1, [lumaq+lstrideq*(1<<%3)]
+ mova m16, m12
+ vpermi2w m16, m2, m0
+ vpermt2w m2, m13, m0
+ mova m17, m12
+ vpermi2w m17, m3, m1
+ vpermt2w m3, m13, m1
+ pavgw m2, m16
+ pavgw m3, m17
+%elif %1
+ mova m2, [lumaq+lstrideq*0]
+ mova m3, [lumaq+lstrideq*1]
+%endif
+%if %2
+ mova ym16, [srcq+strideq*0]
+ vinserti32x8 m16, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+%else
+ mova m16, [srcq+strideq*0]
+%endif
+%if %1
+ punpckhwd m17, m2, m16
+ mova m0, m14
+ vpdpwssd m0, m17, m15
+ punpcklwd m17, m2, m16
+ mova m2, m14
+ vpdpwssd m2, m17, m15
+%endif
+%if %2
+ mova ym17, [srcq+strideq*0]
+ vinserti32x8 m17, [srcq+strideq*1], 1
+%else
+ mova m17, [srcq+strideq*1]
+%endif
+%if %1
+ psrad m0, 6
+ psrad m2, 6
+ packusdw m2, m0
+ punpckhwd m0, m3, m17
+ mova m1, m14
+ vpdpwssd m1, m15, m0
+ punpcklwd m0, m3, m17
+ mova m3, m14
+ vpdpwssd m3, m15, m0
+ psrad m1, 6
+ psrad m3, 6
+ packusdw m3, m1
+ pminuw m2, m4
+ pminuw m3, m4
+
+.add_noise_main:
+ ; scaling[luma_src]
+ kmovw k5, k1
+ pand m1, m5, m2
+ vpgatherdd m0{k5}, [scalingq+m1]
+ kmovw k5, k1
+ psrld m2, 16
+ vpgatherdd m1{k5}, [scalingq+m2]
+ vpshufb m0{k2}, m1, m6
+ kmovw k5, k1
+ psrld m1, m3, 16
+ vpgatherdd m2{k5}, [scalingq+m1]
+ kmovw k5, k1
+ pand m3, m5
+ vpgatherdd m1{k5}, [scalingq+m3]
+ vpshufb m1{k2}, m2, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ vpsllvw m0, m7
+ vpsllvw m1, m7
+ pmulhrsw m18, m0
+ pmulhrsw m19, m1
+ add grain_lutq, 82*(4<<%2)
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ lea srcq, [srcq+strideq*2]
+ paddw m16, m18
+ paddw m17, m19
+ pmaxsw m16, m8
+ pmaxsw m17, m8
+ pminsw m16, m9
+ pminsw m17, m9
+%if %2
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+ lea dstq, [dstq+strideq*2]
+ mova [dstq+strideq*0], ym17
+ vextracti32x8 [dstq+strideq*1], m17, 1
+%else
+ mova [dstq+strideq*0], m16
+ mova [dstq+strideq*1], m17
+%endif
+ lea dstq, [dstq+strideq*2]
+ ret
+%else
+%if %2
+ pand m2, m4
+ pand m3, m4
+%else
+ pand m2, m4, [lumaq+lstrideq*0]
+ pand m3, m4, [lumaq+lstrideq*1]
+%endif
+ jmp .add_noise_main
+%endif
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
+
+%endif
diff --git a/third_party/dav1d/src/x86/filmgrain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm
new file mode 100644
index 0000000000..6b0daaac0b
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm
@@ -0,0 +1,3421 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+SECTION_RODATA 16
+pd_16: times 4 dd 16
+pw_1: times 8 dw 1
+pw_16384: times 8 dw 16384
+pw_8192: times 8 dw 8192
+pw_23_22: dw 23, 22
+ times 3 dw 0, 32
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+pw_27_17_17_27: dw 27, 17, 17, 27
+ times 2 dw 0, 32
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512, 1024
+max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
+min: dw 0, 16*4, 16*16
+; these two should be next to each other
+pw_4: times 2 dw 4
+pw_16: times 2 dw 16
+
+%macro JMP_TABLE 1-*
+ %xdefine %1_table %%table
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .ar%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
+
+SECTION .text
+
+%if ARCH_X86_32
+%undef base
+%define PIC_ptr(a) base+a
+%else
+%define PIC_ptr(a) a
+%endif
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
+%assign %%idx 0
+%define %%tmp %2
+%if %0 == 8
+%define %%tmp %8
+%endif
+%rep (%6/2)
+%if %%idx == 0
+ movd %5 %+ d, %2
+ pshuflw %%tmp, %2, q3232
+%else
+ movd %5 %+ d, %%tmp
+%if %6 == 8
+%if %%idx == 2
+ punpckhqdq %%tmp, %%tmp
+%elif %%idx == 4
+ psrlq %%tmp, 32
+%endif
+%endif
+%endif
+ movzx %4 %+ d, %5 %+ w
+ shr %5 %+ d, 16
+
+%if %%idx == 0
+ movd %1, [%3+%4*%7]
+%else
+ pinsrw %1, [%3+%4*%7], %%idx + 0
+%endif
+ pinsrw %1, [%3+%5*%7], %%idx + 1
+%assign %%idx %%idx+2
+%endrep
+%endmacro
+
+%macro SPLATD 2 ; dst, src
+%ifnidn %1, %2
+ movd %1, %2
+%endif
+ pshufd %1, %1, q0000
+%endmacro
+
+%macro SPLATW 2 ; dst, src
+%ifnidn %1, %2
+ movd %1, %2
+%endif
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+%endmacro
+
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
+ lea r4, [pb_mask]
+%define base r4-pb_mask
+%else
+cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
+ LEA r4, $$
+%define base r4-$$
+%endif
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r3d, [fg_dataq+FGData.grain_scale_shift]
+ lea r5d, [bdmaxq+1]
+ shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc
+ sub r3, r5
+ SPLATW m6, [base+round+r3*2-2]
+ mova m5, [base+pb_mask]
+ SPLATW m0, [fg_dataq+FGData.seed]
+ mov r3, -73*82*2
+ sub bufq, r3
+%if ARCH_X86_64
+ lea r6, [gaussian_sequence]
+%endif
+.loop:
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m2, m3
+ psllq m3, m2, 15
+ por m2, m3 ; aggregate each bit into next seed's high bit
+ pmulhuw m3, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ vpgatherdw m3, m2, r6, r5, r7, 4, 2
+%else
+ vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2
+%endif
+ paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
+ ; shifts by 0, which pmulhrsw does not support
+ pmulhrsw m3, m6
+ movq [bufq+r3], m3
+ add r3, 4*2
+ jl .loop
+
+ ; auto-regression code
+ movsxd r3, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
+ lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
+ jmp r3
+
+.ar1:
+%if WIN64
+ DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
+ lea bufq, [r0-2*(82*73-(82*3+79))]
+ PUSH r8
+%else
+%if ARCH_X86_64
+ DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
+%else ; x86-32
+ DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
+ PUSH r6
+%define shiftd r1d
+%endif
+ sub bufq, 2*(82*73-(82*3+79))
+%endif
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_y]
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+%if WIN64
+ DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
+%elif ARCH_X86_64
+ DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
+%else ; x86-32
+%undef shiftd
+ DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
+%define hd dword r0m
+%define maxd dword minm
+%endif
+%if cpuflag(sse4)
+ pmovsxbw m4, m4
+%else
+ pxor m3, m3
+ pcmpgtb m3, m4
+ punpcklbw m4, m3
+%endif
+ pinsrw m4, [base+pw_1], 3
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd
+ mov hd, 70
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu m0, [bufq+xq*2-82*2-2] ; top/left
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar1
+%if WIN64
+ POP r8
+%elif ARCH_X86_32
+ POP r6
+%undef maxd
+%undef hd
+%endif
+.ar0:
+ RET
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*8
+%endif
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m0, [base+round_vals-12+shiftq*2]
+ pshuflw m0, m0, q0000
+ movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11
+ pxor m2, m2
+ punpcklwd m0, m2
+ pcmpgtb m2, m6
+ punpckhbw m3, m6, m2
+ punpcklbw m6, m2
+ pshufd m2, m6, q3333
+ pshufd m1, m6, q2222
+ pshufd m7, m6, q1111
+ pshufd m6, m6, q0000
+ pshufd m4, m3, q1111
+ pshufd m3, m3, q0000
+%if ARCH_X86_64
+ SWAP 0, 12
+ SWAP 1, 8
+ SWAP 2, 9
+ SWAP 3, 10
+ SWAP 4, 11
+%else
+%define m12 [rsp+0*16]
+%define m8 [rsp+1*16]
+%define m9 [rsp+2*16]
+%define m10 [rsp+3*16]
+%define m11 [rsp+4*16]
+ mova m12, m0
+ mova m8, m1
+ mova m9, m2
+ mova m10, m3
+ mova m11, m4
+ mov bdmaxd, bdmaxm
+%endif
+ sar bdmaxd, 1
+ SPLATW m0, bdmaxd ; max_grain
+ pcmpeqw m1, m1
+%if !cpuflag(sse4)
+ pcmpeqw m2, m2
+ psrldq m2, 14
+ pslldq m2, 2
+ pxor m2, m1
+%endif
+ pxor m1, m0 ; min_grain
+%if ARCH_X86_64
+ SWAP 0, 13
+ SWAP 1, 14
+ SWAP 2, 15
+%else
+%define m13 [rsp+5*16]
+%define m14 [rsp+6*16]
+ mova m13, m0
+ mova m14, m1
+%if !cpuflag(sse4)
+%define m15 [rsp+7*16]
+ mova m15, m2
+%endif
+%endif
+ sub bufq, 2*(82*73-(82*3+79))
+ DEFINE_ARGS buf, fg_data, h, x
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+
+.x_loop_ar2:
+ movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ psrldq m2, m0, 2
+ psrldq m3, m0, 4
+ psrldq m4, m0, 6
+ psrldq m5, m0, 8
+ punpcklwd m0, m2
+ punpcklwd m3, m4
+ punpcklwd m5, m1
+ psrldq m2, m1, 2
+ psrldq m4, m1, 4
+ punpcklwd m2, m4
+ psrldq m4, m1, 6
+ psrldq m1, 8
+ punpcklwd m4, m1
+ pmaddwd m0, m6
+ pmaddwd m3, m7
+ pmaddwd m5, m8
+ pmaddwd m2, m9
+ pmaddwd m4, m10
+ paddd m0, m3
+ paddd m5, m2
+ paddd m0, m4
+ paddd m0, m5 ; accumulated top 2 rows
+ paddd m0, m12
+
+ movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ pshufd m4, m1, q3321
+ pxor m2, m2
+ pcmpgtw m2, m4
+ punpcklwd m4, m2 ; in dwords, y=0,x=[0,3]
+.x_loop_ar2_inner:
+ pmaddwd m2, m1, m11
+ paddd m2, m0
+ psrldq m0, 4 ; shift top to next pixel
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ paddd m2, m4
+ packssdw m2, m2
+ pminsw m2, m13
+ pmaxsw m2, m14
+ psrldq m4, 4
+ pslldq m2, 2
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000010b
+%else
+ pand m1, m15
+ pandn m3, m15, m2
+ por m1, m3
+%endif
+ ; overwrite previous pixel, this should be ok
+ movd [bufq+xq*2-2], m1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar2
+%if ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
+.ar3:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+%if WIN64
+ mov r6, rsp
+ and rsp, ~15
+ sub rsp, 64
+ %define tmp rsp
+%elif ARCH_X86_64
+ %define tmp rsp+stack_offset-72
+%else
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*12
+ %define tmp rsp
+ mov bdmaxd, bdmaxm
+%endif
+ sar bdmaxd, 1
+ SPLATW m7, bdmaxd ; max_grain
+ pcmpeqw m6, m6
+%if !cpuflag(sse4)
+ pcmpeqw m4, m4
+ psrldq m4, 14
+ pslldq m4, 4
+ pxor m4, m6
+%endif
+ pxor m6, m7 ; min_grain
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+
+%if ARCH_X86_64
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m14 [rsp+10*16]
+%define m15 [esp+11*16]
+ mova m14, m6
+ mova m15, m7
+%endif
+
+    ; build cf0-1 through cf18-19 in m5-12 and r0/1
+ pxor m1, m1
+ movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+
+%if cpuflag(sse4)
+ pshufd m4, m2, q3333
+%else
+ pshufd m5, m2, q3333
+ mova [tmp+48], m5
+%endif
+ pshufd m3, m2, q2222
+ pshufd m1, m2, q0000
+ pshufd m2, m2, q1111
+ pshufd m7, m0, q2222
+ pshufd m6, m0, q1111
+ pshufd m5, m0, q0000
+ pshufd m0, m0, q3333
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+%else
+%define m8 [rsp+4*16]
+%define m9 [esp+5*16]
+%define m10 [rsp+6*16]
+%define m11 [esp+7*16]
+%define m12 [rsp+8*16]
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+%endif
+
+ ; build cf20,round in r2
+ ; build cf21-23,round*2 in m13
+ pxor m1, m1
+ movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ mova [tmp+ 0], m1
+ mova [tmp+16], m2
+ psrldq m3, m0, 10
+ pinsrw m3, [base+round_vals+shiftq*2-10], 3
+
+%if ARCH_X86_64
+ SWAP 3, 13
+%else
+%define m13 [esp+9*16]
+ mova m13, m3
+%endif
+
+ pinsrw m0, [base+round_vals+shiftq*2-12], 5
+ pshufd m3, m0, q2222
+ mova [tmp+32], m3
+
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+
+.x_loop_ar3:
+ movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
+ palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
+ palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
+ punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+
+ pmaddwd m0, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ paddd m0, m2
+ paddd m0, m3
+ ; m0 = top line first 6 multiplied by cf, m1 = top line last entry
+
+ movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
+ movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
+ punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
+    palignr              m4, m3, m2, 2       ; y=-2,x=[-2,+5]
+    palignr              m3, m3, m2, 4       ; y=-2,x=[-1,+6]
+ punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+
+ pmaddwd m1, m8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+ pmaddwd m2, m11
+ paddd m1, m4
+ paddd m3, m2
+ paddd m0, m1
+ paddd m0, m3
+ ; m0 = top 2 lines multiplied by cf
+
+ movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
+ palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
+ palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
+ punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ punpcklwd m2, [base+pw_1]
+
+%if cpuflag(sse4)
+ pmaddwd m1, m12
+%else
+ pmaddwd m1, [tmp+48]
+%endif
+ pmaddwd m3, [tmp+ 0]
+ pmaddwd m4, [tmp+16]
+ pmaddwd m2, [tmp+32]
+ paddd m1, m3
+ paddd m4, m2
+ paddd m0, m1
+ paddd m0, m4
+ ; m0 = top 3 lines multiplied by cf plus rounding for downshift
+
+ movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pmaddwd m2, m1, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ packssdw m2, m2
+ pminsw m2, m15
+ pmaxsw m2, m14
+ pslldq m2, 4
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000100b
+%else
+ pand m1, m12
+ pandn m3, m12, m2
+ por m1, m3
+%endif
+ ; overwrite a couple of pixels, should be ok
+ movq [bufq+xq*2-4], m1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar3
+%if WIN64
+ mov rsp, r6
+%elif ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
+%define base r8-pb_mask
+ lea r8, [pb_mask]
+ movifnidn bdmaxd, bdmaxm
+ lea r6d, [bdmaxq+1]
+%else
+cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
+%define base r2-$$
+ LEA r2, $$
+ mov fg_dataq, r2m
+ mov r6d, r4m
+ inc r6d
+%endif
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc
+ sub r5, r6
+ SPLATW m6, [base+round+r5*2-2]
+ mova m5, [base+pb_mask]
+ SPLATW m0, [fg_dataq+FGData.seed]
+%if ARCH_X86_64
+ SPLATW m2, [base+pw_seed_xor+uvq*4]
+%else
+ mov r5d, r3m
+ SPLATW m2, [base+pw_seed_xor+r5*4]
+%endif
+ pxor m0, m2
+%if ARCH_X86_64
+ lea r6, [gaussian_sequence]
+%endif
+%if %2
+ mov hd, 73-35*%3
+ add bufq, 44*2
+.loop_y:
+ mov xq, -44
+%else
+ mov xq, -82*73
+ add bufq, 82*73*2
+%endif
+.loop_x:
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m2, m3
+ psllq m3, m2, 15
+ por m2, m3 ; aggregate each bit into next seed's high bit
+ pmulhuw m3, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ vpgatherdw m3, m2, r6, r9, r10, 4, 2
+%else
+ vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2
+%endif
+ paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
+ ; shifts by 0, which pmulhrsw does not support
+ pmulhrsw m3, m6
+ movq [bufq+xq*2], m3
+ add xq, 4
+ jl .loop_x
+%if %2
+ add bufq, 82*2
+ dec hd
+ jg .loop_y
+%endif
+
+ ; auto-regression code
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
+ lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
+ jmp r5
+
+.ar0:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*2
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ SPLATW m3, [base+hmul_bits+shiftq*2-10]
+%if ARCH_X86_64
+ sar bdmaxd, 1
+    SPLATW               m1, bdmaxd          ; max_grain
+%else
+ SPLATW m1, r4m
+ psraw m1, 1
+%endif
+ pcmpeqw m7, m7
+ pxor m7, m1 ; min_grain
+%if ARCH_X86_64
+ SWAP 1, 14
+ DEFINE_ARGS buf, bufy, h, x
+%else
+%define m14 [rsp+0*16]
+ mova m14, m1
+ DEFINE_ARGS buf, bufy, pic_reg, h, x
+%endif
+ pxor m5, m5
+ pcmpgtb m5, m4
+ punpcklbw m4, m5
+%if %2
+ SPLATW m6, [base+hmul_bits+2+%3*2]
+%endif
+ SPLATW m4, m4
+ pxor m5, m5
+%if %2
+%if !cpuflag(sse4)
+ pcmpeqw m2, m2
+ pslldq m2, 12
+%if ARCH_X86_64
+ SWAP 2, 12
+%else
+%define m12 [rsp+1*16]
+ mova m12, m2
+%endif
+%endif
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+82-(82*3+41))
+%else
+ sub bufq, 2*(82*70-3)
+%endif
+ add bufyq, 2*(3+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar0:
+ ; first 32 pixels
+ xor xd, xd
+.x_loop_ar0:
+ movu m0, [bufyq+xq*(2<<%2)]
+%if %2
+%if %3
+ movu m2, [bufyq+xq*4+82*2]
+ paddw m0, m2
+%endif
+ movu m1, [bufyq+xq*4 +16]
+%if %3
+ movu m2, [bufyq+xq*4+82*2+16]
+ paddw m1, m2
+%endif
+ phaddw m0, m1
+ pmulhrsw m0, m6
+%endif
+ punpckhwd m1, m0, m5
+ punpcklwd m0, m5
+ REPX {pmaddwd x, m4}, m0, m1
+ REPX {psrad x, 5}, m0, m1
+ packssdw m0, m1
+ pmulhrsw m0, m3
+ movu m1, [bufq+xq*2]
+ paddw m0, m1
+ pminsw m0, m14
+ pmaxsw m0, m7
+ cmp xd, 72-40*%2
+ je .end
+ movu [bufq+xq*2], m0
+ add xd, 8
+ jmp .x_loop_ar0
+
+ ; last 6/4 pixels
+.end:
+%if %2
+%if cpuflag(sse4)
+ pblendw m0, m1, 11000000b
+%else
+ pand m1, m12
+ pandn m2, m12, m0
+ por m0, m1, m2
+%endif
+ movu [bufq+xq*2], m0
+%else
+ movq [bufq+xq*2], m0
+%endif
+
+ add bufq, 82*2
+ add bufyq, 82*(2<<%3)
+ dec hd
+ jg .y_loop_ar0
+%if ARCH_X86_32
+%undef m12
+%undef m14
+%endif
+ RET
+
+.ar1:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
+%else
+%assign stack_offset stack_offset_old
+%xdefine rstk rsp
+%assign stack_size_padded 0
+ DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ imul uvd, 28
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+%if WIN64
+ DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
+%if %2
+ lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
+%else
+ lea bufq, [r0-2*(82*69+3)]
+%endif
+%else
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
+%else
+ DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
+%define hd dword r1m
+%define mind dword r3m
+%define maxd dword r4m
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+%endif
+%if ARCH_X86_64
+ mov shiftd, [r2+FGData.ar_coeff_shift]
+%else
+ mov shiftd, [r3+FGData.ar_coeff_shift]
+%endif
+ pxor m5, m5
+ pcmpgtb m5, m4
+ punpcklbw m4, m5 ; cf0-4 in words
+ pshuflw m4, m4, q2100
+ psrldq m4, 2 ; cf0-3,4 in words
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+ pxor m6, m6
+ punpcklwd m3, m6
+%if %2
+ SPLATW m6, [base+hmul_bits+2+%3*2]
+%endif
+ SPLATD m3, m3
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+ sar maxd, 1
+%if ARCH_X86_64
+ mov mind, maxd
+ xor mind, -1
+%else
+ DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
+ mov r2, maxd
+ xor r2, -1
+ mov mind, r2
+%endif
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu m0, [bufq+xq*2-82*2-2] ; top/left
+%if %2
+ movu m7, [bufyq+xq*4]
+%if %3
+ movu m1, [bufyq+xq*4+82*2]
+ phaddw m7, m1
+%else
+ phaddw m7, m7
+%endif
+%else
+ movq m7, [bufyq+xq*2]
+%endif
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+%if %2
+%if %3
+ pshufd m2, m7, q3232
+ paddw m7, m2
+%endif
+ pmulhrsw m7, m6
+%endif
+ punpcklwd m1, m7
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+ paddd m0, m3
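+    ; m0 = top-left/top/top-right taps + luma tap + rounding for 4 pixels;
+    ; the serial loop below adds cf3*left per pixel, since each output
+    ; becomes the left neighbor of the next pixel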
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar1
+%if ARCH_X86_32
+%undef maxd
+%undef mind
+%undef hd
+%endif
+ RET
+
+.ar2:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+ ALLOC_STACK -16*8
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m5, bdmaxd ; max_grain
+%else
+ SPLATW m5, r4m
+ psraw m5, 1
+%endif
+ pcmpeqw m6, m6
+%if !cpuflag(sse4)
+ pcmpeqw m7, m7
+ psrldq m7, 14
+ pslldq m7, 2
+ pxor m7, m6
+%endif
+ pxor m6, m5 ; min_grain
+%if %2 && cpuflag(sse4)
+ SPLATW m7, [base+hmul_bits+2+%3*2]
+%endif
+
+%if ARCH_X86_64
+ SWAP 5, 13
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m13 [rsp+5*16]
+%define m14 [rsp+6*16]
+%define m15 [rsp+7*16]
+ mova m13, m5
+ mova m14, m6
+ mova m15, m7
+%endif
+
+ ; coef values
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pinsrw m2, [base+round_vals-12+shiftq*2], 5
+
+ pshufd m6, m0, q0000
+ pshufd m7, m0, q1111
+ pshufd m1, m0, q3333
+ pshufd m0, m0, q2222
+ pshufd m3, m2, q1111
+ pshufd m4, m2, q2222
+ pshufd m2, m2, q0000
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+%else
+%define m8 [rsp+0*16]
+%define m9 [rsp+1*16]
+%define m10 [rsp+2*16]
+%define m11 [rsp+3*16]
+%define m12 [rsp+4*16]
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+%endif
+
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ psrldq m4, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m1, m0, 4 ; y=-2,x=[-0,+5]
+ psrldq m3, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m2, m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ pmaddwd m2, m8
+ paddd m0, m1
+ paddd m0, m2
+ psrldq m3, m5, 2 ; y=-1,x=[-1,+5]
+ psrldq m1, m5, 4 ; y=-1,x=[-0,+5]
+ psrldq m4, m5, 6 ; y=-1,x=[+1,+5]
+ psrldq m2, m5, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m3, m1
+ punpcklwd m4, m2
+ pmaddwd m3, m9
+ pmaddwd m4, m10
+ paddd m3, m4
+ paddd m0, m3
+
+ ; luma component & rounding
+%if %2
+ movu m1, [bufyq+xq*4]
+%if %3
+ movu m2, [bufyq+xq*4+82*2]
+ phaddw m1, m2
+ pshufd m2, m1, q3232
+ paddw m1, m2
+%else
+ phaddw m1, m1
+%endif
+%if cpuflag(sse4)
+ pmulhrsw m1, m15
+%elif %3
+ pmulhrsw m1, [base+pw_8192]
+%else
+ pmulhrsw m1, [base+pw_16384]
+%endif
+%else
+ movq m1, [bufyq+xq*2]
+%endif
+ punpcklwd m1, [base+pw_1]
+ pmaddwd m1, m12
+ paddd m0, m1
+
+ movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ pshufd m2, m1, q3321
+ pxor m3, m3
+ pcmpgtw m3, m2
+ punpcklwd m2, m3 ; y=0,x=[0,3] in dword
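+    ; the y=0 taps read pixels written earlier in this row, so the final
+    ; accumulation runs one output at a time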
+.x_loop_ar2_inner:
+ pmaddwd m3, m1, m11
+ paddd m3, m0
+ psrldq m0, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ ; we do not need to packssdw since we only care about one value
+ paddd m3, m2
+ packssdw m3, m3
+ pminsw m3, m13
+ pmaxsw m3, m14
+ psrldq m1, 2
+ pslldq m3, 2
+ psrldq m2, 4
+%if cpuflag(sse4)
+ pblendw m1, m3, 00000010b
+%else
+ pand m1, m15
+ pandn m4, m15, m3
+ por m1, m4
+%endif
+ ; overwrite previous pixel, should be ok
+ movd [bufq+xq*2-2], m1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar2
+%if ARCH_X86_32
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
+.ar3:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%if WIN64
+ mov r6, rsp
+ and rsp, ~15
+ sub rsp, 96
+ %define tmp rsp
+%else
+ %define tmp rsp+stack_offset-120
+%endif
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*14
+ mov bufyq, r1m
+ mov uvd, r3m
+ %define tmp rsp
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ SPLATW m4, [base+round_vals-12+shiftq*2]
+ pxor m5, m5
+ pcmpgtw m5, m4
+ punpcklwd m4, m5
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m6, bdmaxd ; max_grain
+%else
+ SPLATW m6, r4m
+ psraw m6, 1
+%endif
+ pcmpeqw m7, m7
+%if !cpuflag(sse4)
+ pcmpeqw m3, m3
+ psrldq m3, 14
+ pslldq m3, 4
+ pxor m3, m7
+%endif
+ pxor m7, m6 ; min_grain
+%if %2 && cpuflag(sse4)
+ SPLATW m3, [base+hmul_bits+2+%3*2]
+%endif
+
+%if ARCH_X86_64
+ SWAP 3, 11
+ SWAP 4, 12
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m11 [rsp+ 9*16]
+%define m12 [rsp+10*16]
+%define m14 [rsp+12*16]
+%define m15 [rsp+13*16]
+ mova m11, m3
+ mova m12, m4
+ mova m14, m6
+ mova m15, m7
+%endif
+
+    ; cf from y=-3,x=-3 until y=-1,x=-2
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pshufd m1, m0, q0000
+ pshufd m3, m0, q1111
+ pshufd m4, m0, q2222
+ pshufd m0, m0, q3333
+ pshufd m5, m2, q0000
+ pshufd m6, m2, q1111
+ mova [tmp+16*0], m1
+ mova [tmp+16*1], m3
+ mova [tmp+16*2], m4
+ mova [tmp+16*3], m0
+ mova [tmp+16*4], m5
+ mova [tmp+16*5], m6
+ pshufd m6, m2, q2222
+ pshufd m7, m2, q3333
+
+ ; cf from y=-1,x=-1 to y=0,x=-1 + luma component
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1 ; luma
+ punpcklbw m0, m1
+ pshufd m3, m0, q3232
+ psrldq m5, m0, 10
+ ; y=0,x=[-3 to -1] + "1.0" for current pixel
+ pinsrw m5, [base+round_vals-10+shiftq*2], 3
+ ; y=-1,x=[-1 to +2]
+ pshufd m1, m0, q0000
+ pshufd m0, m0, q1111
+ ; y=-1,x=+3 + luma
+ punpcklwd m3, m2
+ pshufd m3, m3, q0000
+
+%if ARCH_X86_64
+ SWAP 1, 8
+ SWAP 0, 9
+ SWAP 3, 10
+ SWAP 5, 13
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%else
+%define m8 [rsp+ 6*16]
+%define m9 [rsp+ 7*16]
+%define m10 [rsp+ 8*16]
+%define m13 [rsp+11*16]
+ mova m8, m1
+ mova m9, m0
+ mova m10, m3
+ mova m13, m5
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+
+.x_loop_ar3:
+ ; first line
+ movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
+ palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
+ palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
+ punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+
+ pmaddwd m0, [tmp+0*16]
+ pmaddwd m2, [tmp+1*16]
+ pmaddwd m3, [tmp+2*16]
+ paddd m0, m2
+ paddd m0, m3 ; first 6 x of top y
+
+ ; second line [m0/1 are busy]
+ movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
+ movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
+ punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
+ palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5]
+    palignr         m3, m3, m2, 4          ; y=-2,x=[-1,+6]
+ punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+    shufps          m3, m4, m5, q1032      ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, [tmp+3*16]
+ pmaddwd m4, [tmp+4*16]
+ pmaddwd m3, [tmp+5*16]
+ pmaddwd m5, m6
+ paddd m1, m4
+ paddd m3, m5
+ paddd m0, m1
+ paddd m0, m3 ; top 2 lines
+
+ ; third line [m0 is busy] & luma + round
+ movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
+%if %2
+ movu m5, [bufyq+xq*4]
+%if %3
+ movu m4, [bufyq+xq*4+82*2]
+ phaddw m5, m4
+%else
+ phaddw m5, m5
+%endif
+%else
+ movq m5, [bufyq+xq*2]
+%endif
+ palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
+ palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
+%if %3
+ pshufd m4, m5, q3232
+ paddw m5, m4
+%endif
+%if %2
+%if cpuflag(sse4)
+ pmulhrsw m5, m11
+%elif %3
+ pmulhrsw m5, [base+pw_8192]
+%else
+ pmulhrsw m5, [base+pw_16384]
+%endif
+%endif
+ punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ punpcklwd m2, m5
+ pmaddwd m1, m7
+ pmaddwd m3, m8
+ pmaddwd m4, m9
+ pmaddwd m2, m10
+ paddd m1, m3
+ paddd m4, m2
+ paddd m0, m12 ; += round
+ paddd m1, m4
+ paddd m0, m1
+
+ movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pmaddwd m2, m1, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ packssdw m2, m2
+ pminsw m2, m14
+ pmaxsw m2, m15
+ pslldq m2, 4
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000100b
+%else
+ pand m1, m11
+ pandn m3, m11, m2
+ por m1, m3
+%endif
+ ; overwrite previous pixels, should be ok
+ movq [bufq+xq*2-4], m1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar3
+%if WIN64
+ mov rsp, r6
+%elif ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+ ; copy stack arguments to new position post-alignment, so that we
+ ; don't have to keep the old stack location in a separate register
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov r5, r8m
+
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+
+ mov r0m, r0
+ mov r2m, r1
+ mov r4m, r2
+ mov r6m, r3
+ mov r7m, r4
+ mov r8m, r5
+%else
+cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+%endif
+ mov srcq, srcm
+ mov scalingq, r5m
+ mov fg_dataq, r3m
+%if STACK_ALIGNMENT < mmsize
+ mov r6, r9m
+
+%define r9m [rsp+8*mmsize+ 4*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+
+ mov r9m, r6
+%endif
+ LEA r5, $$
+%define base r5-$$
+ mov r5m, picptrq
+%else
+cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+ lea r8, [pb_mask]
+%define base r8-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ SPLATW m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0, 3
+%else
+ DECLARE_REG_TMP 9, 10
+%endif
+ mov t0d, r9m ; bdmax
+ sar t0d, 11 ; is_12bpc
+ inc t0d
+ mov t1d, r6d
+ imul t1d, t0d
+ dec t0d
+ SPLATW m5, [base+min+t1*2]
+ lea t0d, [t0d*3]
+ lea t0d, [r6d*2+t0d]
+ SPLATW m4, [base+max+t0*2]
+ SPLATW m2, r9m
+
+ pcmpeqw m1, m1
+ psraw m7, m2, 1 ; max_grain
+ pxor m1, m7 ; min_grain
+ SPLATD m6, [base+pd_16]
+
+ SCRATCH 1, 9, 0
+ SCRATCH 2, 10, 1
+ SCRATCH 3, 11, 2
+ SCRATCH 4, 12, 3
+ SCRATCH 5, 13, 4
+ SCRATCH 6, 14, 5
+ SCRATCH 7, 15, 6
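+    ; scratch layout: m9=min_grain, m10=bdmax, m11=scaling multiplier,
+    ; m12=pixel clip max, m13=pixel clip min, m14=pd_16, m15=max_grain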
+
+ mova m6, [base+pw_27_17_17_27] ; for horizontal filter
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
+ DECLARE_REG_TMP 0
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see
+ DECLARE_REG_TMP 7
+%endif
+
+ mov sbyd, r8m
+ movzx t0d, byte [fg_dataq+FGData.overlap_flag]
+ test t0d, t0d
+ jz .no_vertical_overlap
+ test sbyd, sbyd
+ jnz .vertical_overlap
+.no_vertical_overlap:
+ mov dword r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, src_bak
+%endif
+
+ lea src_bakq, [srcq+wq*2]
+ mov r9mp, src_bakq
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+.loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
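+    ; the or/test/cmovp sequence advances the 16-bit grain LFSR: the parity
+    ; flag carries the xor of the feedback taps (bits 0, 1, 3 and 12) and
+    ; selects whether bit 15 is set in the shifted-down seed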
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak
+%endif
+
+.loop_x_odd:
+ movzx hd, word r7m
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4
+ vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4
+%else
+ vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4
+ vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4
+%endif
+ REPX {psrlw x, 8}, m2, m3
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2]
+ movu m5, [grain_lutq+offxyq*2+16]
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m2, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp ; src += stride
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ btc dword r8m, 2
+ jc .next_blk
+ add offxyd, 16
+ test dword r8m, 2
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r12d, 16 ; top_offxy += 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.next_blk:
+ test dword r8m, 1
+ jz .loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jnz .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+%if ARCH_X86_32
+ add offxyd, 16
+ mov [rsp+8*mmsize+0*gprsize], offxyd
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+ mov seed, r3m
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy
+%endif
+
+ mov hd, dword r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m5, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize]
+ movd m4, [grain_lutq+r5*2]
+%else
+ movd m4, [grain_lutq+left_offxyq*2]
+%endif
+ punpcklwd m4, m5
+ pmaddwd m4, m6
+ paddd m4, m14
+ psrad m4, 5
+ packssdw m4, m4
+ pminsw m4, m15
+ pmaxsw m4, m9
+ shufps m4, m5, q3210
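+    ; m4 = this row's grain with its two leftmost samples blended against
+    ; the previous block (27/17 and 17/27 weights), other samples unchanged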
+
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5
+ vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5
+ vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5
+%endif
+ REPX {psrlw x, 8}, m2, m3
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ movu m5, [grain_lutq+offxyq*2+16]
+ REPX {pmullw x, m11}, m2, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y_h_overlap
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ or dword r8m, 4
+ add offxyd, 16
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r12d, 16 ; top_offxy += 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end:
+ RET
+
+.vertical_overlap:
+ or t0d, 2
+ mov r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see
+%endif
+
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul t0d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add t0d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and t0d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, t0d
+%if ARCH_X86_32
+ xor sbyd, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, src_bak
+%endif
+
+ lea src_bakq, [srcq+wq*2]
+ mov r9mp, src_bakq
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+.loop_x_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ SPLATD m7, [base+pw_27_17_17_27]
+ mov seed, r3m
+%else
+ SPLATD m7, [pw_27_17_17_27]
+%endif
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, unused, top_offxy
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, unused, top_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+.loop_x_odd_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
+ mov hd, dword r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+1*gprsize]
+ movu m2, [grain_lutq+r5*2]
+%else
+ movu m2, [grain_lutq+top_offxyq*2]
+%endif
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ REPX {pmaddwd x, m7}, m4, m2
+ REPX {paddd x, m14}, m4, m2
+ REPX {psrad x, 5}, m4, m2
+ packssdw m2, m4
+ pminsw m2, m15
+ pmaxsw m2, m9
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m3, [grain_lutq+r5*2+16]
+%else
+ movu m3, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m5, m3, m4
+ punpcklwd m3, m4
+ REPX {pmaddwd x, m7}, m5, m3
+ REPX {paddd x, m14}, m5, m3
+ REPX {psrad x, 5}, m5, m3
+ packssdw m3, m5
+ pminsw m3, m15
+ pmaxsw m3, m9
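+    ; m2/m3 = this row's grain blended with the block above; the two
+    ; overlapping rows use the 27/17 and 17/27 weight pairs respectively,
+    ; rounded (+16, >>5) and clamped to the grain range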
+
+ ; src
+ pand m0, m10, [srcq+ 0] ; m0-1: src as word
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5
+%endif
+ psrlw m4, 8
+ pmullw m4, m11
+ pmulhrsw m4, m2
+%if ARCH_X86_32
+ vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2
+%else
+ vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2
+%endif
+ psrlw m5, 8
+ pmullw m5, m11
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hw
+ jz .end_y_v_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_v_overlap
+ jmp .loop_y
+
+.end_y_v_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ btc dword r8m, 2
+ jc .next_blk_v
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ jmp .loop_x_odd_v_overlap
+
+.next_blk_v:
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+    ; back to .loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+
+.loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r0, [rsp+8*mmsize+1*gprsize]
+ add r3, 16
+ add r0, 16
+ mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
+ mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy
+
+ mov seed, r3m
+ xor r0, r0
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
+
+ movzx hd, word r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m2, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
+ movu m4, [grain_lutq+r0*2]
+ movd m5, [grain_lutq+r5*2]
+ mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
+ movd m3, [grain_lutq+r5*2]
+%else
+ movu m4, [grain_lutq+top_offxyq*2]
+ movd m5, [grain_lutq+left_offxyq*2]
+ movd m3, [grain_lutq+topleft_offxyq*2]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklwd m5, m2
+ punpcklwd m3, m4
+ REPX {pmaddwd x, m6}, m5, m3
+ REPX {paddd x, m14}, m5, m3
+ REPX {psrad x, 5}, m5, m3
+ packssdw m5, m3
+ pminsw m5, m15
+ pmaxsw m5, m9
+ shufps m3, m5, m2, q3210
+ shufps m5, m4, q3232
+ ; followed by v interpolation (top | cur -> cur)
+ movu m0, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m1, [grain_lutq+r0*2+16]
+%else
+ movu m1, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpcklwd m2, m5, m3
+ punpckhwd m5, m3
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ REPX {pmaddwd x, m7}, m2, m5, m3, m1
+ REPX {paddd x, m14}, m2, m5, m3, m1
+ REPX {psrad x, 5}, m2, m5, m3, m1
+ packssdw m2, m5
+ packssdw m3, m1
+ REPX {pminsw x, m15}, m2, m3
+ REPX {pmaxsw x, m9}, m2, m3
+
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5
+%endif
+ psrlw m4, 8
+ pmullw m4, m11
+ pmulhrsw m2, m4
+%if ARCH_X86_32
+ vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4
+%else
+ vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4
+%endif
+ psrlw m5, 8
+ pmullw m5, m11
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hw
+ jz .end_y_hv_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+ or dword r8m, 4
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov r5, r5m
+ add offxyd, 16
+ add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ add offxyd, 16
+ add top_offxyd, 16
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end_hv:
+ RET
+%if ARCH_X86_32
+ DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+%endif
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+INIT_XMM ssse3
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r4, r3m
+ mov r3, r4m
+ mov r5, r5m
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r1m [rsp+8*mmsize+ 4*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+ mov r0m, r0
+ mov r2m, r2
+ mov r4m, r3
+ mov r5m, r5
+
+ mov r0, r6m
+ mov r2, r7m
+ mov r3, r8m
+ mov r5, r9m
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+%define r9m [rsp+8*mmsize+12*gprsize]
+ mov r6m, r0
+ mov r7m, r2
+ mov r8m, r3
+ mov r9m, r5
+
+ mov r2, r10m
+ mov r3, r11m
+ mov r5, r12m
+ mov r0, r13m
+%define r10m [rsp+8*mmsize+13*gprsize]
+%define r11m [rsp+8*mmsize+14*gprsize]
+%define r12m [rsp+8*mmsize+15*gprsize]
+ mov r10m, r2
+ mov r11m, r3
+ mov r12m, r5
+
+ SPLATW m2, r13m
+%else
+cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov srcq, srcm
+ mov fg_dataq, r3m
+%endif
+ LEA r5, $$
+%define base r5-$$
+
+ DECLARE_REG_TMP 0, 2, 3
+%else
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r8-pb_mask
+ lea r8, [pb_mask]
+
+ DECLARE_REG_TMP 9, 10, 11
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ SPLATW m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+%if STACK_ALIGNMENT >= mmsize
+ mov t0d, r13m ; bdmax
+%endif
+ sar t0d, 11 ; is_12bpc
+ inc t0d
+ mov t1d, r6d
+ imul t1d, t0d
+ dec t0d
+ SPLATW m5, [base+min+t1*2]
+ lea t1d, [t0d*3]
+ mov t2d, r12m
+ inc t2d
+ imul r6d, t2d
+ add t1d, r6d
+ SPLATW m4, [base+max+t1*2]
+%if STACK_ALIGNMENT >= mmsize
+ SPLATW m2, r13m
+%endif
+
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+
+%define mzero m7
+
+%if %3
+ SPLATD m2, [base+pw_23_22]
+%endif
+
+%if ARCH_X86_32
+ mov scalingq, r5m
+ mov r5m, r5
+%else
+ mov r13mp, strideq
+%endif
+
+ pcmpeqw m0, m0
+ psraw m1, m10, 1
+ pxor m0, m1
+
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
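+    ; scratch layout: m8=min_grain, m9=max_grain, m10=bdmax,
+    ; m11=scaling multiplier, m12=pixel clip max, m13=pixel clip min
+    ; (m14/m15 are assigned per variant inside the loop macro below)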
+
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+
+ DECLARE_REG_TMP 0
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+
+ DECLARE_REG_TMP 9
+%endif
+
+%if %1
+ mov r6d, r11m
+ SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4]
+ SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
+ punpcklwd m6, m1, m0
+ SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4]
+ SPLATD m7, [base+pw_4+t0*4]
+ pmullw m5, m7
+%else
+ SPLATD m6, [base+pd_16]
+%if %2
+ mova m5, [base+pw_23_22]
+%else
+ mova m5, [base+pw_27_17_17_27]
+%endif
+%endif
+
+ SCRATCH 6, 14, 6
+ SCRATCH 5, 15, 7
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0
+%else
+ DECLARE_REG_TMP 7
+%endif
+
+ mov sbyd, r8m
+ mov t0d, [fg_dataq+FGData.overlap_flag]
+ test t0d, t0d
+ jz %%no_vertical_overlap
+ test sbyd, sbyd
+ jnz %%vertical_overlap
+
+%%no_vertical_overlap:
+ mov r8m, t0d
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
+
+ mov dstq, r0mp
+ mov lumaq, r9mp
+ mov wq, r4m
+ lea r3, [srcq+wq*2]
+ mov r1mp, r3
+ lea r3, [dstq+wq*2]
+ mov r11mp, r3
+ lea r3, [lumaq+wq*(2<<%2)]
+ mov r12mp, r3
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused2, unused3, see, unused4, unused5, unused6, luma, lstride
+
+ mov lstrideq, r10mp
+%if %3
+ add lstrideq, lstrideq
+%endif
+ mov lumaq, r9mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r10mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+%endif
+ neg wq
+%if ARCH_X86_32
+ mov r4mp, wq
+%endif
+
+%%loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, unused2, unused3, luma, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, unused2, unused3, luma, lstride
+%endif
+
+%if %2 == 0
+%%loop_x_odd:
+%endif
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y:
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16] ; m0-1: src as word
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9m
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m4, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9m, lumaq
+%endif
+%if %2
+ pavgw m4, mzero
+ pavgw m6, mzero
+%endif
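+    ; with horizontal subsampling the luma was folded down to chroma width
+    ; (phaddw pairs, then pavgw with zero halves the sum); with vertical
+    ; subsampling only every other luma row is read (lstride was doubled)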
+
+%if %1
+ punpckhwd m3, m4, m0
+ punpcklwd m4, m0
+ punpckhwd m5, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m3, m4, m5, m6
+ REPX {psrad x, 6}, m3, m4, m5, m6
+ packssdw m4, m3
+ packssdw m6, m5
+ REPX {paddw x, m15}, m4, m6
+ REPX {pmaxsw x, mzero}, m4, m6
+ REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m4, m6
+%endif
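+    ; m4/m6 now hold the scaling-LUT index: the (downsampled) luma itself
+    ; when chroma_scaling_from_luma is set, otherwise
+    ; clip((uv_luma_mult*luma + uv_mult*chroma) >> 6 + bit-depth-scaled uv_offset)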
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m3, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2]
+ movu m6, [grain_lutq+offxyq*2+16]
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m3, m5
+ pmulhrsw m4, m3
+ pmulhrsw m6, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hd
+ jg %%loop_y
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma
+
+ mov wq, r4mp
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0m, dstq
+ mov r9m, lumaq
+ mov r4m, wq
+%endif
+%if %2 == 0
+ btc dword r8m, 2
+ jc %%next_blk
+ add offxyd, 16
+ test dword r8m, 2
+ jz %%loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%%next_blk:
+%endif
+ test dword r8m, 1
+ je %%loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jnz %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+%if ARCH_X86_32
+ add offxyd, 16
+ mov [rsp+8*mmsize+0*gprsize], offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, luma, lstride
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, luma, lstride
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+ mov lumaq, r9m
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m4, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9m, lumaq
+%endif
+%if %2
+ pavgw m4, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m3, m4, m0
+ punpcklwd m4, m0
+ punpckhwd m5, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m3, m4, m5, m6
+ REPX {psrad x, 6}, m3, m4, m5, m6
+ packssdw m4, m3
+ packssdw m6, m5
+ REPX {paddw x, m15}, m4, m6
+ REPX {pmaxsw x, mzero}, m4, m6
+ REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m4, m6
+%endif
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m7, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize]
+ movd m5, [grain_lutq+r5*2]
+%else
+ movd m5, [grain_lutq+left_offxyq*2+ 0]
+%endif
+ punpcklwd m5, m7 ; {left0, cur0}
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %2
+ pmaddwd m5, [PIC_ptr(pw_23_22)]
+%else
+ pmaddwd m5, [PIC_ptr(pw_27_17_17_27)]
+%endif
+ paddd m5, [PIC_ptr(pd_16)]
+%else
+ pmaddwd m5, m15
+ paddd m5, m14
+%endif
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, m9
+ shufps m5, m7, q3210
+ movu m3, [grain_lutq+offxyq*2+16]
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1
+ vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m7, m4
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m4
+ pmulhrsw m5, m7
+ pmulhrsw m3, m4
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m5
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hd
+ jg %%loop_y_h_overlap
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+ mov wq, r4mp
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0mp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+
+%if %2
+ ; r8m = sbym
+ test dword r8m, 2
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+%else
+ or dword r8m, 4
+ add offxyd, 16
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jz %%loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxy += 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end:
+ RET
+
+%%vertical_overlap:
+ or t0d, 2
+ mov r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, unused1, unused2, unused3, lstride
+%endif
+
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul t0d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add t0d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and t0d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, t0d
+%if ARCH_X86_32
+ xor sbyd, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
+
+ mov r3m, seed
+ mov dstq, r0mp
+ mov lumaq, r9mp
+ mov wq, r4m
+ lea r3, [srcq+wq*2]
+ mov r1mp, r3
+ lea r3, [dstq+wq*2]
+ mov r11mp, r3
+ lea r3, [lumaq+wq*(2<<%2)]
+ mov r12mp, r3
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, unused3, unused4, unused5, luma, lstride
+
+ mov lstrideq, r10mp
+%if %3
+ add lstrideq, lstrideq
+%endif
+ mov lumaq, r9mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r10mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+%endif
+ neg wq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+ xor t0d, t0d
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, top_offxy, unused2, luma, lstride
+%endif
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if %2 == 0
+%%loop_x_odd_v_overlap:
+%endif
+%if %3 == 0
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_v_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
+ movu m5, [grain_lutq+r0*2]
+%else
+ movu m5, [grain_lutq+top_offxyq*2]
+%endif
+ punpckhwd m7, m5, m3
+ punpcklwd m5, m3 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
+%else
+ REPX {paddd x, m14}, m7, m5
+%endif
+ REPX {psrad x, 5}, m7, m5
+ packssdw m3, m5, m7
+ pmaxsw m3, m8
+ pminsw m3, m9
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m5, [grain_lutq+r0*2+16]
+%else
+ movu m5, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m7, m5, m4
+ punpcklwd m5, m4 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
+%else
+ REPX {paddd x, m14}, m7, m5
+%endif
+ REPX {psrad x, 5}, m7, m5
+ packssdw m4, m5, m7
+ pmaxsw m4, m8
+ pminsw m4, m9
+
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9mp
+%endif
+ mova m5, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m5, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+%if %2
+ pavgw m5, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m7, m5, m0
+ punpcklwd m5, m0
+ REPX {pmaddwd x, m14}, m7, m5
+ REPX {psrad x, 6}, m7, m5
+ packssdw m5, m7
+ punpckhwd m7, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m7, m6
+ REPX {psrad x, 6}, m7, m6
+ packssdw m6, m7
+ pxor mzero, mzero
+ REPX {paddw x, m15}, m5, m6
+ REPX {pmaxsw x, mzero}, m5, m6
+ REPX {pminsw x, m10}, m5, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m5, m6
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m5
+ pmulhrsw m3, m7
+ pmulhrsw m4, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+ dec hw
+ jle %%end_y_v_overlap
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+%if %3
+ jmp %%loop_y
+%else
+ btc hd, 16
+ jc %%loop_y
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
+ jmp %%loop_y_v_overlap
+%endif
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0mp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+
+%if %2
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+    ; back to .loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+%else
+ btc dword r8m, 2
+ jc %%loop_x_hv_overlap
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut
+
+ mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
+ add offxyd, 16
+ add t0d, 16
+ mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
+ mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+ xor t0d, t0d
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+%endif
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if %3 == 0
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
+ mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ movd m5, [grain_lutq+r5*2]
+%else
+ movd m5, [grain_lutq+left_offxyq*2]
+%endif
+ movu m7, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+2*gprsize]
+ movu m4, [grain_lutq+r0*2]
+%if %2
+ pinsrw m5, [grain_lutq+r5*2], 2
+%else
+ movd m3, [grain_lutq+r5*2]
+%endif
+%else
+ movu m4, [grain_lutq+top_offxyq*2]
+%if %2
+ pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
+%else
+ movd m3, [grain_lutq+topleft_offxyq*2]
+%endif
+%endif
+%if %2 == 0
+ punpckldq m5, m3
+%endif
+ punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 }
+ punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %2
+ movddup m0, [PIC_ptr(pw_23_22)]
+%else
+ movddup m0, [PIC_ptr(pw_27_17_17_27)]
+%endif
+%else
+ pshufd m0, m15, q1010
+%endif
+ pmaddwd m5, m0
+%if %1
+ paddd m5, [PIC_ptr(pd_16)]
+%else
+ paddd m5, m14
+%endif
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, m9
+ shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3
+ shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter
+ shufps m5, m4, q3231 ; top0-7 post-h_filter
+
+ punpckhwd m7, m5, m3
+ punpcklwd m5, m3 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
+%else
+ REPX {paddd x, m14}, m5, m7
+%endif
+ REPX {psrad x, 5}, m5, m7
+ packssdw m3, m5, m7
+ pmaxsw m3, m8
+ pminsw m3, m9
+
+ ; right half
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m0, [grain_lutq+r0*2+16]
+%else
+ movu m0, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m1, m0, m4
+ punpcklwd m0, m4 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m1, m0
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
+%else
+ REPX {paddd x, m14}, m1, m0
+%endif
+ REPX {psrad x, 5}, m1, m0
+ packssdw m4, m0, m1
+ pmaxsw m4, m8
+ pminsw m4, m9
+
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9mp
+%endif
+ mova m6, [lumaq+ 0]
+ mova m5, [lumaq+(16<<%2)]
+%if %2
+ phaddw m6, [lumaq+16]
+ phaddw m5, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+%if %2
+ pavgw m6, mzero
+ pavgw m5, mzero
+%endif
+
+%if %1
+ punpckhwd m7, m6, m0
+ punpcklwd m6, m0
+ REPX {pmaddwd x, m14}, m7, m6
+ REPX {psrad x, 6}, m7, m6
+ packssdw m6, m7
+ punpckhwd m7, m5, m1
+ punpcklwd m5, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m7, m5
+ REPX {psrad x, 6}, m7, m5
+ packssdw m5, m7
+ pxor mzero, mzero
+ REPX {paddw x, m15}, m6, m5
+ REPX {pmaxsw x, mzero}, m6, m5
+ REPX {pminsw x, m10}, m6, m5 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m6, m5
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1
+%else
+%if %3 == 0
+ ; register shortage :)
+ push r12
+%endif
+ vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1
+ vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1
+%if %3 == 0
+ pop r12
+%endif
+%endif
+ REPX {psrlw x, 8}, m7, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m6
+ pmulhrsw m3, m7
+ pmulhrsw m4, m6
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hw
+%if %3
+ jg %%loop_y_h_overlap
+%else
+ jle %%end_y_hv_overlap
+ btc hd, 16
+ jc %%loop_y_h_overlap
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
+ jmp %%loop_y_hv_overlap
+%%end_y_hv_overlap:
+%endif
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov dstmp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+%if %2
+ jmp %%loop_x_hv_overlap
+%else
+ or dword r8m, 4
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxy += 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end_hv:
+ RET
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
diff --git a/third_party/dav1d/src/x86/filmgrain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm
new file mode 100644
index 0000000000..55445cf593
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm
@@ -0,0 +1,2107 @@
+; Copyright © 2019-2022, VideoLAN and dav1d authors
+; Copyright © 2019-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0
+gen_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+gen_shufB: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+gen_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+gen_shufD: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+; note: the order of (some of) the following constants matters
+pb_27_17: times 2 db 27, 17
+byte_blend: db 0, 0, 0, -1
+pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32
+pb_17_27: times 2 db 17, 27
+pb_1: times 4 db 1
+pb_23_22: db 23, 22, 0, 32, 0, 32, 0, 32
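+; the trailing 0,32 pairs act as pass-through weights (0*old + 32*new gives
+; new after the >>5), presumably so one multiply-add can blend the
+; overlapped samples and copy the remaining ones unchanged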
+next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+fg_min: times 4 db 0
+ times 4 db 16
+fg_max: times 4 db 255
+ times 4 db 240
+ times 4 db 235
+pd_m65536: dd -65536
+pw_8: times 2 dw 8
+pw_1024: times 2 dw 1024
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512
+pw_1: dw 1
+
+%macro JMP_TABLE 2-*
+ %1_8bpc_%2_table:
+ %xdefine %%base %1_8bpc_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %rep %0 - 2
+ dd %%prefix %+ .ar%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
+%define base r4-generate_grain_y_8bpc_avx2_table
+ lea r4, [generate_grain_y_8bpc_avx2_table]
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r6d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movq xm4, [base+mul_bits]
+ movq xm5, [base+hmul_bits]
+ mov r7, -73*82
+ mova xm6, [base+pb_mask]
+ sub bufq, r7
+ vpbroadcastw xm7, [base+round+r6*2]
+ lea r6, [gaussian_sequence]
+ movsxd r5, [r4+r5*4]
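+ ; fill the 82x73 luma grain buffer: each iteration advances the four parallel
+ ; LFSR seeds twice and looks up 8 grain values in gaussian_sequence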
+.loop:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm3, xm2 ; 4 next output seeds
+ pshuflw xm0, xm3, q3333
+ psrlw xm3, 5
+ pand xm2, xm0, xm1
+ movq r2, xm3
+ psrlw xm3, xm2, 10
+ por xm2, xm3
+ pmullw xm2, xm4
+ pmulhuw xm0, xm5
+ movzx r3d, r2w
+ pshufb xm3, xm6, xm2
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm0, xm2
+ movd xm2, [r6+r3*2]
+ rorx r3, r2, 32
+ por xm3, xm0
+ shr r2d, 16
+ pinsrw xm2, [r6+r2*2], 1
+ pshuflw xm0, xm3, q3333
+ movzx r2d, r3w
+ psrlw xm3, 5
+ pinsrw xm2, [r6+r2*2], 2
+ shr r3d, 16
+ movq r2, xm3
+ pinsrw xm2, [r6+r3*2], 3
+ movzx r3d, r2w
+ pinsrw xm2, [r6+r3*2], 4
+ rorx r3, r2, 32
+ shr r2d, 16
+ pinsrw xm2, [r6+r2*2], 5
+ movzx r2d, r3w
+ pinsrw xm2, [r6+r2*2], 6
+ shr r3d, 16
+ pinsrw xm2, [r6+r3*2], 7
+ pmulhrsw xm2, xm7
+ packsswb xm2, xm2
+ movq [bufq+r7], xm2
+ add r7, 8
+ jl .loop
+
+ ; auto-regression code
+ add r5, r4
+ jmp r5
+
+.ar1:
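+ ; AR(1): grain[x] += round2(cf0*topleft + cf1*top + cf2*topright + cf3*left, shift),
+ ; clamped to [-128,127]; the left tap creates a serial per-pixel dependency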
+ DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd xm5, [fg_dataq+FGData.ar_coeffs_y]
+ mova xm2, [base+gen_shufC]
+ DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+ pinsrb xm5, [base+pb_1], 3
+ vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd
+ pmovsxbw xm5, xm5
+ pshufd xm4, xm5, q0000
+ pshufd xm5, xm5, q1111
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+ pmovsxbw xm1, [bufq+xq-82-3]
+ pshufb xm0, xm1, xm2
+ punpckhwd xm1, xm3
+ pmaddwd xm0, xm4
+ pmaddwd xm1, xm5
+ paddd xm0, xm1
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ movsx val0d, byte [bufq+xq]
+ sarx val3d, val3d, shiftd
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xb, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+.x_loop_ar1_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar1
+.ar0:
+ RET
+
+.ar2:
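+ ; AR(2): 12 taps covering the two rows above and the two pixels to the left;
+ ; the top-row taps are summed with SIMD, the left taps in the serial inner loop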
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign xmm_regs_used 16
+ %assign stack_size_padded 168
+ SUB rsp, stack_size_padded
+ movaps [rsp+16*2], xmm8
+ movaps [rsp+16*3], xmm9
+ movaps [rsp+16*4], xmm10
+ movaps [rsp+16*5], xmm11
+ movaps [rsp+16*6], xmm12
+ movaps [rsp+16*7], xmm13
+ movaps [rsp+16*8], xmm14
+ movaps [rsp+16*9], xmm15
+%endif
+ DEFINE_ARGS buf, fg_data, h, x
+ mov r6d, [fg_dataq+FGData.ar_coeff_shift]
+ pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
+ movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
+ vpbroadcastd xm10, [base+round_vals-14+r6*2]
+ movd xm11, [base+byte_blend+1]
+ pmovsxbw xm9, xm9
+ pshufd xm4, xm7, q0000
+ mova xm12, [base+gen_shufA]
+ pshufd xm5, xm7, q3333
+ mova xm13, [base+gen_shufB]
+ pshufd xm6, xm7, q1111
+ mova xm14, [base+gen_shufC]
+ pshufd xm7, xm7, q2222
+ mova xm15, [base+gen_shufD]
+ pshufd xm8, xm9, q0000
+ psrld xm10, 16
+ pshufd xm9, xm9, q1111
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+.x_loop_ar2:
+ pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pshufb xm2, xm0, xm12
+ pmaddwd xm2, xm4
+ pshufb xm3, xm1, xm13
+ pmaddwd xm3, xm5
+ paddd xm2, xm3
+ pshufb xm3, xm0, xm14
+ pmaddwd xm3, xm6
+ punpckhqdq xm0, xm0
+ punpcklwd xm0, xm1
+ pmaddwd xm0, xm7
+ pshufb xm1, xm15
+ pmaddwd xm1, xm8
+ paddd xm2, xm10
+ paddd xm2, xm3
+ paddd xm0, xm1
+ paddd xm2, xm0
+ movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+ pmovsxbw xm1, xm0
+ pmaddwd xm3, xm9, xm1
+ psrldq xm1, 4 ; y=0,x=0
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ paddw xm3, xm1
+ packsswb xm3, xm3
+ pextrb [bufq+xq], xm3, 0
+ pslldq xm3, 2
+ vpblendvb xm0, xm3, xm11
+ psrldq xm0, 1
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+.x_loop_ar2_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+INIT_YMM avx2
+.ar3:
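+ ; AR(3): 24 taps covering the three rows above and the three pixels to the
+ ; left; top-row taps are summed with SIMD, the left taps in the serial inner loop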
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign stack_offset 16
+ ALLOC_STACK 16*14
+ %assign stack_size stack_size - 16*4
+ %assign xmm_regs_used 12
+ movaps [rsp+16*12], xmm8
+ movaps [rsp+16*13], xmm9
+ movaps [rsp+16*14], xmm10
+ movaps [rsp+16*15], xmm11
+%else
+ ALLOC_STACK 16*12
+%endif
+ mov r6d, [fg_dataq+FGData.ar_coeff_shift]
+ movd xm11, [base+byte_blend]
+ pmovsxbw m1, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pshufd m0, m1, q0000
+ mova [rsp+16* 0], m0
+ pshufd m0, m1, q1111
+ mova [rsp+16* 2], m0
+ pshufd m0, m1, q2222
+ mova [rsp+16* 4], m0
+ pshufd m1, m1, q3333
+ mova [rsp+16* 6], m1
+ pshufd xm0, xm2, q0000
+ mova [rsp+16* 8], xm0
+ pshufd xm0, xm2, q1111
+ mova [rsp+16* 9], xm0
+ psrldq xm7, xm2, 10
+ mova m8, [base+gen_shufA]
+ pinsrw xm2, [base+pw_1], 5
+ mova m9, [base+gen_shufC]
+ pshufd xm2, xm2, q2222
+ movu m10, [base+gen_shufE]
+ vpbroadcastw xm6, [base+round_vals-12+r6*2]
+ pinsrw xm7, [base+round_vals+r6*2-10], 3
+ mova [rsp+16*10], xm2
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+.x_loop_ar3:
+ movu xm5, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ vinserti128 m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12]
+ movu xm4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ punpcklbw m3, m5, m5
+ punpckhwd m5, m4
+ psraw m3, 8
+ punpcklbw m5, m5
+ psraw m5, 8
+ punpcklbw xm4, xm4
+ psraw xm4, 8
+ pshufb m0, m3, m8
+ pmaddwd m0, [rsp+16*0]
+ pshufb m1, m3, m9
+ pmaddwd m1, [rsp+16*2]
+ shufps m2, m3, m5, q1032
+ paddd m0, m1
+ pshufb m1, m2, m8
+ vperm2i128 m3, m4, 0x21
+ pmaddwd m1, [rsp+16*4]
+ shufps xm2, xm3, q1021
+ vpblendd m2, m3, 0xf0
+ pshufb m2, m10
+ paddd m0, m1
+ pmaddwd m2, [rsp+16*6]
+ pshufb xm1, xm4, xm9
+ pmaddwd xm1, [rsp+16*8]
+ shufps xm4, xm5, q1132
+ paddd m0, m2
+ pshufb xm2, xm4, xm8
+ pshufd xm4, xm4, q2121
+ pmaddwd xm2, [rsp+16*9]
+ punpcklwd xm4, xm6
+ pmaddwd xm4, [rsp+16*10]
+ vextracti128 xm3, m0, 1
+ paddd xm0, xm1
+ movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ paddd xm2, xm4
+ paddd xm0, xm2
+ paddd xm0, xm3
+.x_loop_ar3_inner:
+ pmovsxbw xm2, xm1
+ pmaddwd xm2, xm7
+ pshufd xm3, xm2, q1111
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ packsswb xm2, xm2
+ pextrb [bufq+xq], xm2, 0
+ pslldq xm2, 3
+ vpblendvb xm1, xm2, xm11
+ psrldq xm1, 1
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
+INIT_XMM avx2
+cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
+%define base r4-generate_grain_uv_%1_8bpc_avx2_table
+ lea r4, [generate_grain_uv_%1_8bpc_avx2_table]
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r6d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ movq xm4, [base+mul_bits]
+ movq xm5, [base+hmul_bits]
+ mova xm6, [base+pb_mask]
+ vpbroadcastw xm7, [base+round+r6*2]
+ vpbroadcastd xm2, [base+pw_seed_xor+uvq*4]
+ pxor xm0, xm2
+ lea r6, [gaussian_sequence]
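+ ; horizontally subsampled chroma grain is 44 columns wide (38 rows for 4:2:0,
+ ; 73 for 4:2:2), still stored with an 82-byte row stride; 4:4:4 uses the full
+ ; 82x73 block like luma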
+%if %2
+ mov r7d, 73-35*%3
+ add bufq, 44
+.loop_y:
+ mov r5, -44
+%else
+ mov r5, -73*82
+ sub bufq, r5
+%endif
+.loop:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm2, xm3 ; 4 next output seeds
+ pshuflw xm0, xm2, q3333
+ psrlw xm2, 5
+ movq r8, xm2
+ movzx r9d, r8w
+ movd xm2, [r6+r9*2]
+ rorx r9, r8, 32
+ shr r8d, 16
+ pinsrw xm2, [r6+r8*2], 1
+ movzx r8d, r9w
+ pinsrw xm2, [r6+r8*2], 2
+ shr r9d, 16
+ pinsrw xm2, [r6+r9*2], 3
+ pmulhrsw xm2, xm7
+ packsswb xm2, xm2
+ movd [bufq+r5], xm2
+ add r5, 4
+ jl .loop
+%if %2
+ add bufq, 82
+ dec r7d
+ jg .loop_y
+%endif
+
+ ; auto-regression code
+ movsxd r6, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4]
+ add r6, r4
+ jmp r6
+
+INIT_YMM avx2
+.ar0:
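+ ; chroma AR(0): add round2(ar_coeff * the (downsampled) luma grain, shift)
+ ; to the chroma grain, with a saturating pack back to bytes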
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ movd xm3, [base+hmul_bits+shiftq*2]
+ DEFINE_ARGS buf, bufy, h
+ pmovsxbw xm2, xm2
+%if %2
+ vpbroadcastd m7, [base+pb_1]
+ vpbroadcastw m6, [base+hmul_bits+2+%3*2]
+%endif
+ vpbroadcastw m2, xm2
+ vpbroadcastw m3, xm3
+ pxor m12, m12
+%if %2
+ sub bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+ sub bufq, 82*70-3
+%endif
+ add bufyq, 3+82*3
+ mov hd, 70-35*%3
+.y_loop_ar0:
+%if %2
+ ; first 32 pixels
+ movu xm4, [bufyq]
+ vinserti128 m4, [bufyq+32], 1
+%if %3
+ movu xm0, [bufyq+82]
+ vinserti128 m0, [bufyq+82+32], 1
+%endif
+ movu xm5, [bufyq+16]
+ vinserti128 m5, [bufyq+48], 1
+%if %3
+ movu xm1, [bufyq+82+16]
+ vinserti128 m1, [bufyq+82+48], 1
+%endif
+ pmaddubsw m4, m7, m4
+%if %3
+ pmaddubsw m0, m7, m0
+%endif
+ pmaddubsw m5, m7, m5
+%if %3
+ pmaddubsw m1, m7, m1
+ paddw m4, m0
+ paddw m5, m1
+%endif
+ pmulhrsw m4, m6
+ pmulhrsw m5, m6
+%else
+ xor r3d, r3d
+ ; first 32x2 pixels
+.x_loop_ar0:
+ movu m4, [bufyq+r3]
+ pcmpgtb m0, m12, m4
+ punpckhbw m5, m4, m0
+ punpcklbw m4, m0
+%endif
+ pmullw m4, m2
+ pmullw m5, m2
+ pmulhrsw m4, m3
+ pmulhrsw m5, m3
+%if %2
+ movu m1, [bufq]
+%else
+ movu m1, [bufq+r3]
+%endif
+ pcmpgtb m8, m12, m1
+ punpcklbw m0, m1, m8
+ punpckhbw m1, m8
+ paddw m0, m4
+ paddw m1, m5
+ packsswb m0, m1
+%if %2
+ movu [bufq], m0
+%else
+ movu [bufq+r3], m0
+ add r3d, 32
+ cmp r3d, 64
+ jl .x_loop_ar0
+%endif
+
+ ; last 6/12 pixels
+ movu xm4, [bufyq+32*2]
+%if %2
+%if %3
+ movu xm5, [bufyq+32*2+82]
+%endif
+ pmaddubsw xm4, xm7, xm4
+%if %3
+ pmaddubsw xm5, xm7, xm5
+ paddw xm4, xm5
+%endif
+ movq xm0, [bufq+32]
+ pmulhrsw xm4, xm6
+ pmullw xm4, xm2
+ pmulhrsw xm4, xm3
+ pcmpgtb xm5, xm12, xm0
+ punpcklbw xm5, xm0, xm5
+ paddw xm4, xm5
+ packsswb xm4, xm4
+ pblendw xm0, xm4, xm0, 1000b
+ movq [bufq+32], xm0
+%else
+ movu xm0, [bufq+64]
+ pcmpgtb xm1, xm12, xm4
+ punpckhbw xm5, xm4, xm1
+ punpcklbw xm4, xm1
+ pmullw xm5, xm2
+ pmullw xm4, xm2
+ vpblendd xm1, xm3, xm12, 0x0c
+ pmulhrsw xm5, xm1
+ pmulhrsw xm4, xm3
+ pcmpgtb xm1, xm12, xm0
+ punpckhbw xm8, xm0, xm1
+ punpcklbw xm0, xm1
+ paddw xm5, xm8
+ paddw xm0, xm4
+ packsswb xm0, xm5
+ movu [bufq+64], xm0
+%endif
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar0
+ RET
+
+INIT_XMM avx2
+.ar1:
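+ ; chroma AR(1): as the luma AR(1), plus an extra tap applied to the
+ ; collocated (downsampled) luma grain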
+ DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
+ DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
+ pmovsxbw xm4, xm4
+ pshufd xm5, xm4, q1111
+ pshufd xm4, xm4, q0000
+ pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
+%if %2
+ vpbroadcastd xm7, [base+pb_1]
+ vpbroadcastw xm6, [base+hmul_bits+2+%3*2]
+%endif
+ vpbroadcastd xm3, xm3
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*70-(82-3)
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+ pmovsxbw xm0, [bufq+xq-82-1] ; top/left
+%if %2
+ movq xm8, [bufyq+xq*2]
+%if %3
+ movq xm9, [bufyq+xq*2+82]
+%endif
+%endif
+ psrldq xm2, xm0, 2 ; top
+ psrldq xm1, xm0, 4 ; top/right
+%if %2
+ pmaddubsw xm8, xm7, xm8
+%if %3
+ pmaddubsw xm9, xm7, xm9
+ paddw xm8, xm9
+%endif
+ pmulhrsw xm8, xm6
+%else
+ pmovsxbw xm8, [bufyq+xq]
+%endif
+ punpcklwd xm0, xm2
+ punpcklwd xm1, xm8
+ pmaddwd xm0, xm4
+ pmaddwd xm1, xm5
+ paddd xm0, xm1
+ paddd xm0, xm3
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sarx val3d, val3d, shiftd
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar1
+ RET
+
+.ar2:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ vpbroadcastw xm13, [base+round_vals-12+shiftq*2]
+ pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
+ pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
+ pinsrw xm0, [base+pw_1], 5
+%if %2
+ vpbroadcastw xm12, [base+hmul_bits+2+%3*2]
+ vpbroadcastd xm11, [base+pb_1]
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+ pshufd xm4, xm7, q0000
+ pshufd xm5, xm7, q3333
+ pshufd xm6, xm7, q1111
+ pshufd xm7, xm7, q2222
+ pshufd xm8, xm0, q0000
+ pshufd xm9, xm0, q1111
+ pshufd xm10, xm0, q2222
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*70-(82-3)
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pshufb xm2, xm0, [base+gen_shufA]
+ pmaddwd xm2, xm4
+ pshufb xm3, xm1, [base+gen_shufB]
+ pmaddwd xm3, xm5
+ paddd xm2, xm3
+ pshufb xm3, xm0, [base+gen_shufC]
+ pmaddwd xm3, xm6
+ punpckhqdq xm0, xm0 ; y=-2,x=[+2,+5]
+ punpcklwd xm0, xm1
+ pmaddwd xm0, xm7
+ pshufb xm1, [gen_shufD]
+ pmaddwd xm1, xm8
+ paddd xm2, xm3
+ paddd xm0, xm1
+ paddd xm2, xm0
+
+%if %2
+ movq xm0, [bufyq+xq*2]
+%if %3
+ movq xm3, [bufyq+xq*2+82]
+%endif
+ pmaddubsw xm0, xm11, xm0
+%if %3
+ pmaddubsw xm3, xm11, xm3
+ paddw xm0, xm3
+%endif
+ pmulhrsw xm0, xm12
+%else
+ pmovsxbw xm0, [bufyq+xq]
+%endif
+ punpcklwd xm0, xm13
+ pmaddwd xm0, xm10
+ paddd xm2, xm0
+
+ movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+ pmovsxbw xm0, xm0
+ pmaddwd xm3, xm0, xm9
+ psrldq xm0, 2
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ pslldq xm3, 2
+ paddw xm3, xm0
+ pblendw xm0, xm3, 00000010b
+ packsswb xm0, xm0
+ pextrb [bufq+xq], xm0, 1
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+INIT_YMM avx2
+.ar3:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
+ pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
+ vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
+ movd xm13, [base+round_vals-10+shiftq*2]
+ vpbroadcastd xm14, [base+round_vals-14+shiftq*2]
+ pshufd m6, m0, q0000
+ pshufd m7, m0, q1111
+ pshufd m8, m0, q2222
+ pshufd m9, m0, q3333
+ pshufd xm10, xm1, q0000
+ pshufd xm11, xm1, q1111
+ pshufhw xm12, xm1, q0000
+ psraw xm2, 8
+ palignr xm13, xm1, 10
+ punpckhwd xm12, xm2 ; interleave luma cf
+ psrld xm14, 16
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+ vpbroadcastw xm15, [base+hmul_bits+2+%3*2]
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*70-(82-3)
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+.x_loop_ar3:
+ vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12]
+ vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ vpblendd m3, m1, 0x0f
+ pxor m0, m0
+ pcmpgtb m2, m0, m3
+ pcmpgtb m0, m4
+ punpcklbw m1, m3, m2
+ punpckhbw m3, m2
+ punpcklbw m2, m4, m0
+ punpckhbw xm4, xm0
+ pshufb m0, m1, [base+gen_shufA]
+ pmaddwd m0, m6
+ pshufb m5, m1, [base+gen_shufC]
+ pmaddwd m5, m7
+ shufps m1, m3, q1032
+ paddd m0, m5
+ pshufb m5, m1, [base+gen_shufA]
+ pmaddwd m5, m8
+ shufps xm1, xm3, q2121
+ vpblendd m1, m2, 0xf0
+ pshufb m1, [base+gen_shufE]
+ pmaddwd m1, m9
+ paddd m0, m5
+ pshufb xm3, xm2, [base+gen_shufC]
+ paddd m0, m1
+ pmaddwd xm3, xm10
+ palignr xm1, xm4, xm2, 2
+ punpckhwd xm1, xm2, xm1
+ pmaddwd xm1, xm11
+ palignr xm4, xm2, 12
+ paddd xm3, xm1
+%if %2
+ vpbroadcastd xm5, [base+pb_1]
+ movq xm1, [bufyq+xq*2]
+ pmaddubsw xm1, xm5, xm1
+%if %3
+ movq xm2, [bufyq+xq*2+82]
+ pmaddubsw xm5, xm2
+ paddw xm1, xm5
+%endif
+ pmulhrsw xm1, xm15
+%else
+ pmovsxbw xm1, [bufyq+xq]
+%endif
+ punpcklwd xm4, xm1
+ pmaddwd xm4, xm12
+ movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ vextracti128 xm2, m0, 1
+ paddd xm0, xm14
+ paddd xm3, xm4
+ paddd xm0, xm3
+ paddd xm0, xm2
+.x_loop_ar3_inner:
+ pmovsxbw xm1, xm1
+ pmaddwd xm2, xm13, xm1
+ pshuflw xm3, xm2, q1032
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ psrldq xm1, 2
+ ; don't packssdw, we only care about one value
+ punpckldq xm2, xm2
+ pblendw xm1, xm2, 0100b
+ packsswb xm1, xm1
+ pextrb [bufq+xq], xm1, 2
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar3
+ RET
+%endmacro
+
+INIT_YMM avx2
+cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, see, overlap
+%define base r9-pd_m65536
+ lea r9, [pd_m65536]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ vpbroadcastd m8, [base+pd_m65536]
+ vpbroadcastw m9, [base+mul_bits+r6*2-14]
+ vpbroadcastd m10, [base+fg_min+r7*4]
+ vpbroadcastd m11, [base+fg_max+r7*8]
+ vpbroadcastd m12, [base+pw_1024]
+ movq xm13, [base+pb_27_17_17_27]
+ test sbyd, sbyd
+ setnz r7b
+ pxor m7, m7
+ test r7b, overlapb
+ jnz .vertical_overlap
+
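+ ; derive the per-superblock-row grain seed from sby and the frame seed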
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, overlap
+
+ mov hd, hm
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
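+ ; even and odd pixel words are looked up with separate dword gathers; since
+ ; vpgatherdd clobbers its mask register, m8 (pd_m65536) is saved in m6 and
+ ; restored between gathers, and pblendw 0xaa merges the two results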
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m5, [grain_lutq+offxyq]
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y
+
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test overlapd, overlapd
+ jz .loop_x
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy
+
+ lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m5, [grain_lutq+offxyq]
+ movd xm4, [grain_lutq+left_offxyq]
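+ ; blend the 2 leftmost grain columns with the previous block's grain
+ ; (weights 27/17 and 17/27, rounded >>5 via pmulhrsw with 1024); the next
+ ; 2 columns pass through unchanged (weight 32)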
+ punpcklbw xm4, xm5
+ pmaddubsw xm4, xm13, xm4
+ pmulhrsw xm4, xm12
+ packsswb xm4, xm4
+ vpblendd m4, m5, 0xfe
+ punpckhbw m5, m7
+ punpcklbw m4, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y_h_overlap
+
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne .loop_x_hv_overlap
+ jmp .loop_x_h_overlap
+
+.vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused, sby, see, overlap
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+.loop_x_v_overlap:
+ vpbroadcastd m14, [pb_27_17]
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_v_overlap:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+ punpcklbw m5, m4, m6
+ punpckhbw m4, m6
+ pmaddubsw m5, m14, m5
+ pmaddubsw m4, m14, m4
+ pmulhrsw m5, m12
+ pmulhrsw m4, m12
+ packsswb m5, m4
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hb
+ jz .end_y_v_overlap
+ vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
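+ ; (adding 0x80000000 sets the sign bit without carrying on the first overlap
+ ; row and carries on the second, so exactly two rows take this path)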
+ add hd, 0x80000000
+ jnc .loop_y_v_overlap
+ jmp .loop_y
+.end_y_v_overlap:
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+.loop_x_hv_overlap:
+ vpbroadcastd m14, [pb_27_17]
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+32]
+ lea left_offxyd, [offyq+32]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_hv_overlap:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq]
+ movd xm7, [grain_lutq+left_offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+ movd xm5, [grain_lutq+topleft_offxyq]
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw xm7, xm6
+ punpcklbw xm5, xm4
+ pmaddubsw xm7, xm13, xm7
+ pmaddubsw xm5, xm13, xm5
+ pmulhrsw xm7, xm12
+ pmulhrsw xm5, xm12
+ packsswb xm7, xm7
+ packsswb xm5, xm5
+ vpblendd m7, m6, 0xfe
+ vpblendd m5, m4, 0xfe
+ ; followed by v interpolation (top | cur -> cur)
+ punpckhbw m4, m6
+ punpcklbw m5, m7
+ pmaddubsw m4, m14, m4
+ pmaddubsw m5, m14, m5
+ pmulhrsw m4, m12
+ pmulhrsw m5, m12
+ pxor m7, m7
+ packsswb m5, m4
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hb
+ jz .end_y_hv_overlap
+ vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ add hd, 0x80000000
+ jnc .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+.end_y_hv_overlap:
+ add wq, 32
+ lea srcq, [src_bakq+wq]
+ jl .loop_x_hv_overlap
+.end:
+ RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, overlap, uv_pl, is_id
+%define base r11-pd_m65536
+ lea r11, [pd_m65536]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r9d, is_idm
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ vpbroadcastd m8, [base+pd_m65536]
+ vpbroadcastw m9, [base+mul_bits+r6*2-14]
+ vpbroadcastd m10, [base+fg_min+r7*4]
+ shlx r7d, r7d, r9d
+ vpbroadcastd m11, [base+fg_max+r7*4]
+ vpbroadcastd m12, [base+pw_1024]
+ pxor m7, m7
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, sby, see, overlap, uv_pl
+%if %1
+ mov r6d, uv_plm
+ vpbroadcastd m0, [base+pw_8]
+ vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4]
+ vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4]
+ pshufb m14, m0 ; uv_luma_mult, uv_mult
+%elif %2
+ vpbroadcastq m15, [base+pb_23_22]
+%else
+ vpbroadcastq xm15, [base+pb_27_17_17_27]
+%endif
+%if %3
+ vpbroadcastw m13, [base+pb_23_22]
+%elif %2
+ pshufd m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
+%endif
+ test r7b, overlapb
+ jnz %%vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused2, unused3, see, overlap, unused4, unused5, lstride
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq]
+ lea r13, [dstq+wq]
+ lea r14, [lumaq+wq*(1+%2)]
+ mov r11mp, r12
+ mov r12mp, r13
+ mov lstrideq, r10mp
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, unused1, unused2, lstride
+
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, unused1, unused2, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+ ; src
+%if %2
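+ ; two luma rows are loaded (one per chroma output row) and horizontal pairs
+ ; are averaged via pmaddubsw with pb_1 followed by pavgw with 0, i.e.
+ ; (a+b+1)>>1, to match the subsampled chroma width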
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3) +0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
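+ ; not-csfl: the scaling index is clip((luma*uv_luma_mult + chroma*uv_mult)>>6 + uv_offset)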
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movu xm5, [grain_lutq+offxyq+ 0]
+ vinserti128 m5, [grain_lutq+offxyq+82], 1
+%else
+ movu m5, [grain_lutq+offxyq]
+%endif
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma_source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+ sub hb, 1+%2
+ jg %%loop_y
+
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+ test overlapd, overlapd
+ jz %%loop_x
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movu xm5, [grain_lutq+offxyq+ 0]
+ vinserti128 m5, [grain_lutq+offxyq+82], 1
+ movd xm4, [grain_lutq+left_offxyq+ 0]
+ vinserti128 m4, [grain_lutq+left_offxyq+82], 1
+ punpcklbw m4, m5
+%if %1
+ vpbroadcastq m0, [pb_23_22]
+ pmaddubsw m4, m0, m4
+%else
+ pmaddubsw m4, m15, m4
+%endif
+ pmulhrsw m4, m12
+ packsswb m4, m4
+ vpblendd m4, m5, 0xee
+%else
+ movu m5, [grain_lutq+offxyq]
+ movd xm4, [grain_lutq+left_offxyq]
+ punpcklbw xm4, xm5
+%if %1
+ movq xm0, [pb_27_17_17_27]
+ pmaddubsw xm4, xm0, xm4
+%else
+ pmaddubsw xm4, xm15, xm4
+%endif
+ pmulhrsw xm4, xm12
+ packsswb xm4, xm4
+ vpblendd m4, m5, 0xfe
+%endif
+ punpckhbw m5, m7
+ punpcklbw m4, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma_source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*(1+%2)
+ sub hb, 1+%2
+ jg %%loop_y_h_overlap
+
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, overlap, unused1, unused2, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused1, unused2, see, overlap, unused3, unused4, lstride
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq]
+ lea r13, [dstq+wq]
+ lea r14, [lumaq+wq*(1+%2)]
+ mov r11mp, r12
+ mov r12mp, r13
+ mov lstrideq, r10mp
+ neg wq
+
+%%loop_x_v_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, top_offxy, unused, lstride
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy, unused, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if %2 == 0
+ vpbroadcastd m13, [pb_27_17]
+%endif
+%%loop_y_v_overlap:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %3 == 0
+%if %2
+ movu xm0, [grain_lutq+offxyq]
+ vinserti128 m0, [grain_lutq+offxyq+82], 1
+ movu xm4, [grain_lutq+top_offxyq]
+ vinserti128 m4, [grain_lutq+top_offxyq+82], 1
+%else
+ movu m0, [grain_lutq+offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+%endif
+ punpcklbw m5, m4, m0
+ punpckhbw m4, m0
+ pmaddubsw m5, m13, m5
+ pmaddubsw m4, m13, m4
+ pmulhrsw m5, m12
+ pmulhrsw m4, m12
+ packsswb m5, m4
+%else
+ movq xm4, [grain_lutq+offxyq]
+ vinserti128 m4, [grain_lutq+offxyq+8], 1
+ movq xm5, [grain_lutq+top_offxyq]
+ vinserti128 m5, [grain_lutq+top_offxyq+8], 1
+ punpcklbw m5, m4
+ pmaddubsw m5, m13, m5
+ pmulhrsw m5, m12
+ vextracti128 xm4, m5, 1
+ packsswb xm5, xm4
+ ; only interpolate first line, insert second line unmodified
+ vinserti128 m5, [grain_lutq+offxyq+82], 1
+%endif
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma_source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+ sub hb, 1+%2
+ jle %%end_y_v_overlap
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+%if %2 == 0
+ vpbroadcastd m13, [pb_17_27]
+ add hd, 0x80000000
+ jnc %%loop_y_v_overlap
+%endif
+ jmp %%loop_y
+
+%%end_y_v_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+
+%%loop_x_hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ lea topleft_offxyd, [top_offxyq+(32>>%2)]
+ lea left_offxyd, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if %2 == 0
+ vpbroadcastd m13, [pb_27_17]
+%endif
+%%loop_y_hv_overlap:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movu xm4, [grain_lutq+offxyq]
+ vinserti128 m4, [grain_lutq+offxyq+82], 1
+ movd xm0, [grain_lutq+left_offxyq]
+ vinserti128 m0, [grain_lutq+left_offxyq+82], 1
+ movd xm6, [grain_lutq+topleft_offxyq]
+%if %3
+ movq xm5, [grain_lutq+top_offxyq]
+ vinserti128 m5, [grain_lutq+top_offxyq+8], 1
+%else
+ vinserti128 m6, [grain_lutq+topleft_offxyq+82], 1
+ movu xm5, [grain_lutq+top_offxyq]
+ vinserti128 m5, [grain_lutq+top_offxyq+82], 1
+%endif
+
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m0, m4
+%if %3
+ punpcklbw xm6, xm5
+%else
+ punpcklbw m6, m5
+%endif
+ punpcklqdq m0, m6
+%if %1
+ vpbroadcastq m6, [pb_23_22]
+ pmaddubsw m0, m6, m0
+%else
+ pmaddubsw m0, m15, m0
+%endif
+ pmulhrsw m0, m12
+ packsswb m0, m0
+ vpblendd m4, m0, 0x11
+%if %3
+ pshuflw xm0, xm0, q1032
+ vpblendd m5, m0, 0x01
+%else
+ pshuflw m0, m0, q1032
+ vpblendd m5, m0, 0x11
+%endif
+%else
+ movu m4, [grain_lutq+offxyq]
+ movd xm0, [grain_lutq+left_offxyq]
+ movu m5, [grain_lutq+top_offxyq]
+ movd xm6, [grain_lutq+topleft_offxyq]
+ punpcklbw xm0, xm4
+ punpcklbw xm6, xm5
+ punpcklqdq xm0, xm6
+%if %1
+ vpbroadcastq xm6, [pb_27_17_17_27]
+ pmaddubsw xm0, xm6, xm0
+%else
+ pmaddubsw xm0, xm15, xm0
+%endif
+ pmulhrsw xm0, xm12
+ packsswb xm0, xm0
+ vpblendd m4, m0, 0x01
+ pshuflw xm0, xm0, q1032
+ vpblendd m5, m0, 0x01
+%endif
+
+ ; followed by v interpolation (top | cur -> cur)
+%if %3
+ vpermq m0, m4, q3120
+ punpcklbw m5, m0
+ pmaddubsw m5, m13, m5
+ pmulhrsw m5, m12
+ vextracti128 xm0, m5, 1
+ packsswb xm5, xm0
+ vpblendd m5, m4, 0xf0
+%else
+ punpckhbw m0, m5, m4
+ punpcklbw m5, m4
+ pmaddubsw m4, m13, m0
+ pmaddubsw m5, m13, m5
+ pmulhrsw m4, m12
+ pmulhrsw m5, m12
+ packsswb m5, m4
+%endif
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+ sub hb, 1+%2
+%if %2
+ jg %%loop_y_h_overlap
+%else
+ je %%end_y_hv_overlap
+ vpbroadcastd m13, [pb_17_27]
+ add hd, 0x80000000
+ jnc %%loop_y_hv_overlap
+ jmp %%loop_y_h_overlap
+%endif
+
+%%end_y_hv_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+ jmp %%loop_x_hv_overlap
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+GEN_GRAIN_UV_FN 420, 1, 1
+FGUV_FN 420, 1, 1
+GEN_GRAIN_UV_FN 422, 1, 0
+FGUV_FN 422, 1, 0
+GEN_GRAIN_UV_FN 444, 0, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/filmgrain_avx512.asm b/third_party/dav1d/src/x86/filmgrain_avx512.asm
new file mode 100644
index 0000000000..317ec118b3
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_avx512.asm
@@ -0,0 +1,813 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_even: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
+ db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
+pb_odd: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
+ db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
+interleave_hl: db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7
+pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32
+pb_23_22_0_32: db 23, 22, 0, 32, 0, 32, 0, 32
+pb_27_17: times 2 db 27, 17
+pb_23_22: times 2 db 23, 22
+pw_8: times 2 dw 8
+pw_1024: times 2 dw 1024
+pb_17_27: times 2 db 17, 27
+fg_max: times 4 db 255
+ times 4 db 240
+ times 4 db 235
+fg_min: times 4 db 0
+ times 4 db 16
+noise_rnd: times 2 dw 128
+ times 2 dw 64
+ times 2 dw 32
+ times 2 dw 16
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, see, overlap
+%define base r11-fg_min
+ lea r11, [fg_min]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ mov r12, 0x0000000f0000000f ; h_overlap mask
+ mova m0, [scalingq+64*0]
+ mova m1, [scalingq+64*1]
+ mova m2, [scalingq+64*2]
+ mova m3, [scalingq+64*3]
+ kmovq k1, r12
+ vbroadcasti32x4 m4, [base+interleave_hl]
+ vpbroadcastd ym16, [base+pb_27_17]
+ vpbroadcastd m12, [base+pb_17_27]
+ vpbroadcastd m6, [base+noise_rnd+r6*4-32]
+ test sbyd, sbyd
+ setnz r6b
+ vpbroadcastd m7, [base+fg_min+r7*4]
+ vpbroadcastd m8, [base+fg_max+r7*8]
+ pxor m5, m5
+ vpbroadcastd m9, [base+pw_1024]
+ vpbroadcastq m10, [base+pb_27_17_17_27]
+ vmovdqa64 m12{k1}, m16
+ test r6b, overlapb
+ jnz .v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, overlap
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
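+ ; two grain rows per iteration, packed into the low/high 256-bit halves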
+ movu ym21, [grain_lutq+offxyq-82]
+ vinserti32x8 m21, [grain_lutq+offxyq+ 0], 1
+ call .add_noise
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test overlapd, overlapd
+ jz .loop_x
+ test sbyd, sbyd
+ jnz .hv_overlap
+
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, left_offxy
+
+ rorx offyd, seed, 8
+ mov left_offxyd, offxd ; previous column's offy*stride
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ movu ym20, [grain_lutq+offxyq-82]
+ vinserti32x8 m20, [grain_lutq+offxyq+ 0], 1
+ movd xm19, [grain_lutq+left_offxyq-50]
+ vinserti32x4 m19, [grain_lutq+left_offxyq+32], 2
+ punpcklbw m19, m20
+ pmaddubsw m19, m10, m19
+ pmulhrsw m19, m9
+ punpckhbw m21, m20, m5
+ packsswb m20{k1}, m19, m19
+ punpcklbw m20, m5, m20
+ call .add_noise_h
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test sbyd, sbyd
+ jnz .hv_overlap
+ jmp .loop_x_h_overlap
+
+.v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
+ h, sby, see, overlap
+
+ movzx r6d, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, r6d, 173 * 0x00010001
+ imul r6d, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add r6d, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and r6d, 0xff00ff00
+ xor seed, r7d
+ xor seed, r6d ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, overlap, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+ movu ym19, [grain_lutq+offxyq-82]
+ vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1
+ movu ym21, [grain_lutq+top_offxyq-82]
+ vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
+ punpckhbw m20, m21, m19
+ punpcklbw m21, m19
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to .v_overlap, and instead always fall-through to h+v overlap
+.hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+ mov topleft_offxyd, top_offxyd
+ rorx offyd, seed, 8
+ mov left_offxyd, offxd
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+ movu ym19, [grain_lutq+offxyq-82]
+ vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1
+ movd xm16, [grain_lutq+left_offxyq-50]
+ vinserti32x4 m16, [grain_lutq+left_offxyq+32], 2
+ movu ym21, [grain_lutq+top_offxyq-82]
+ vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
+ movd xm17, [grain_lutq+topleft_offxyq-50]
+ vinserti32x4 m17, [grain_lutq+topleft_offxyq+32], 2
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m16, m19
+ pmaddubsw m16, m10, m16
+ punpcklbw m17, m21
+ pmaddubsw m17, m10, m17
+ punpckhbw m20, m21, m19
+ pmulhrsw m16, m9
+ pmulhrsw m17, m9
+ packsswb m19{k1}, m16, m16
+ packsswb m21{k1}, m17, m17
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m21, m19
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ lea srcq, [src_bakq+wq]
+ jl .hv_overlap
+.end:
+ RET
+ALIGN function_align
+.add_noise_v:
+ pmaddubsw m20, m12, m20
+ pmaddubsw m21, m12, m21
+ pmulhrsw m20, m9
+ pmulhrsw m21, m9
+ packsswb m21, m20
+.add_noise:
+ punpcklbw m20, m5, m21
+ punpckhbw m21, m5
+.add_noise_h:
+ mova ym18, [srcq+strideq*0]
+ vinserti32x8 m18, [srcq+strideq*1], 1
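+ ; the scaling[] table is held in m0-m3; vpermt2b/vpermi2b perform a 256-entry
+ ; byte lookup, with the index sign bit (vpmovb2m) selecting between the halves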
+ mova m19, m0
+ punpcklbw m16, m18, m5
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpmovb2m k2, m18
+ punpckhbw m17, m18, m5
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+ mova [dstq+srcq], ym16
+ add srcq, strideq
+ vextracti32x8 [dstq+srcq], m16, 1
+ add srcq, strideq
+ ret
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
+ scaling, grain_lut, h, sby, luma, \
+ overlap, uv_pl, is_id, _, stride3
+ lea r11, [fg_min]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r9d, is_idm
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+%if %2
+ mov r12, 0x000f000f000f000f ; h_overlap mask
+ vpbroadcastq m10, [base+pb_23_22_0_32]
+ lea stride3q, [strideq*3]
+%else
+ mov r12, 0x0000000f0000000f
+ vpbroadcastq m10, [base+pb_27_17_17_27]
+%endif
+ mova m0, [scalingq+64*0]
+ mova m1, [scalingq+64*1]
+ mova m2, [scalingq+64*2]
+ mova m3, [scalingq+64*3]
+ kmovq k1, r12
+ vbroadcasti32x4 m4, [base+interleave_hl]
+ vpbroadcastd m6, [base+noise_rnd+r6*4-32]
+ vpbroadcastd m7, [base+fg_min+r7*4]
+ shlx r7d, r7d, r9d
+ vpbroadcastd m8, [base+fg_max+r7*4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m9, [base+pw_1024]
+ mova m11, [base+pb_even]
+ mova m12, [base+pb_odd]
+ pxor m5, m5
+ mov r5, r10mp ; lstride
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+ h, sby, see, overlap, uv_pl, _, _, stride3
+%if %1
+ mov r6d, uv_plm
+ vpbroadcastd m16, [base+pw_8]
+ vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
+ vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4]
+ pshufb m14, m16 ; uv_luma_mult, uv_mult
+%endif
+ test r7b, overlapb
+ jnz %%v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, overlap, _, _, _, stride3
+
+ mov lumaq, r9mp
+ lea r11, [srcq+wq]
+ lea r12, [dstq+wq]
+ lea r13, [lumaq+wq*(1+%2)]
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, overlap, _, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+%if %2
+ movu xm21, [grain_lutq+offxyq+82*0]
+ vinserti128 ym21, [grain_lutq+offxyq+82*1], 1
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+%else
+ movu ym21, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+%endif
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ test overlapd, overlapd
+ jz %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, left_offxy, _, _, _, stride3
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, left_offxy, _, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+%if %2
+ movu xm20, [grain_lutq+offxyq +82*0]
+ movd xm19, [grain_lutq+left_offxyq+82*0]
+ vinserti32x4 ym20, [grain_lutq+offxyq +82*1], 1
+ vinserti32x4 ym19, [grain_lutq+left_offxyq+82*1], 1
+ vinserti32x4 m20, [grain_lutq+offxyq +82*2], 2
+ vinserti32x4 m19, [grain_lutq+left_offxyq+82*2], 2
+ vinserti32x4 m20, [grain_lutq+offxyq +82*3], 3
+ vinserti32x4 m19, [grain_lutq+left_offxyq+82*3], 3
+%else
+ movu ym20, [grain_lutq+offxyq + 0]
+ movd xm19, [grain_lutq+left_offxyq+ 0]
+ vinserti32x8 m20, [grain_lutq+offxyq +82], 1
+ vinserti32x4 m19, [grain_lutq+left_offxyq+82], 2
+%endif
+ punpcklbw m19, m20
+ pmaddubsw m19, m10, m19
+ punpckhbw m21, m20, m5
+ pmulhrsw m19, m9
+ vpacksswb m20{k1}, m19, m19
+ punpcklbw m20, m5, m20
+ call %%add_noise_h
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+ _, sby, see, overlap, _, _, _, stride3
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+%if %3
+ vpbroadcastd m13, [base+pb_23_22]
+ kxnorw k3, k3, k3 ; v_overlap mask
+%elif %2
+ vbroadcasti32x8 m13, [base+pb_27_17]
+ kxnord k3, k3, k3
+ pshufd m13, m13, q0000 ; 8x27_17, 8x17_27
+%else
+ vpbroadcastd ym16, [base+pb_27_17]
+ vpbroadcastd m13, [base+pb_17_27]
+ vmovdqa64 m13{k1}, m16
+%endif
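+ ; m13 now holds the per-row vertical overlap weights: 23/22 when the
+ ; chroma rows are vertically subsampled (only one grain row overlaps),
+ ; otherwise 27/17 for the first overlapping row and 17/27 for the second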
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, overlap, top_offxy, _, _, stride3
+
+ mov lumaq, r9mp
+ lea r11, [srcq+wq]
+ lea r12, [dstq+wq]
+ lea r13, [lumaq+wq*(1<<%2)]
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0x000f000f
+ and offxd, 0x000f000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, overlap, top_offxy, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %3
+ movu xm18, [grain_lutq+offxyq+82*0]
+ movu xm20, [grain_lutq+top_offxyq+82*0]
+ ; only interpolate the first line, insert the remaining lines unmodified
+ vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ punpcklbw xm19, xm20, xm18
+ punpckhbw xm20, xm18
+%elif %2
+ movu xm18, [grain_lutq+offxyq+82*0]
+ vinserti128 ym18, [grain_lutq+offxyq+82*1], 1
+ movu xm20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x4 ym20, [grain_lutq+top_offxyq+82*1], 1
+ vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ punpcklbw ym19, ym20, ym18
+ punpckhbw ym20, ym18
+%else
+ movu ym21, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+ movu ym20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+
+%%hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+ lea topleft_offxyd, [top_offxyq+(32>>%2)]
+ lea left_offxyd, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0x000f000f
+ and offxd, 0x000f000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %2
+ movu xm21, [grain_lutq+offxyq+82*0]
+ movd xm16, [grain_lutq+left_offxyq+82*0]
+ vinserti128 ym21, [grain_lutq+offxyq+82*1], 1
+ vinserti128 ym16, [grain_lutq+left_offxyq+82*1], 1
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*3], 3
+ movd xm18, [grain_lutq+topleft_offxyq+82*0]
+ movu xm20, [grain_lutq+top_offxyq]
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m16, m21
+%if %3
+ punpcklbw xm18, xm20
+%else
+ vinserti128 ym18, [grain_lutq+topleft_offxyq+82*1], 1
+ vinserti128 ym20, [grain_lutq+top_offxyq+82*1], 1
+ punpcklbw ym18, ym20
+%endif
+ punpcklqdq m16, m18
+ pmaddubsw m16, m10, m16
+ pmulhrsw m16, m9
+ packsswb m16, m16
+ vmovdqu8 m21{k1}, m16
+%if %3
+ vpalignr xm20{k1}, xm16, xm16, 4
+ punpcklbw xm19, xm20, xm21
+ punpckhbw xm20, xm21
+%else
+ vpalignr ym20{k1}, ym16, ym16, 4
+ punpcklbw ym19, ym20, ym21
+ punpckhbw ym20, ym21
+%endif
+%else
+ movu ym21, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+ movd xm16, [grain_lutq+left_offxyq+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*1], 2
+ movu ym20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1
+ movd xm18, [grain_lutq+topleft_offxyq+82*0]
+ vinserti32x4 m18, [grain_lutq+topleft_offxyq+82*1], 2
+ punpcklbw m16, m21
+ punpcklbw m18, m20
+ punpcklqdq m16, m18
+ pmaddubsw m16, m10, m16
+ pmulhrsw m16, m9
+ packsswb m16, m16
+ vpalignr m20{k1}, m16, m16, 4
+ vmovdqu8 m21{k1}, m16
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ jmp %%hv_overlap
+ALIGN function_align
+%%add_noise_v:
+%if %3
+ pmaddubsw xm19, xm13, xm19
+ pmaddubsw xm20, xm13, xm20
+ pmulhrsw xm19, xm9
+ pmulhrsw xm20, xm9
+ vpacksswb m21{k3}, m19, m20
+%elif %2
+ pmaddubsw ym19, ym13, ym19
+ pmaddubsw ym20, ym13, ym20
+ pmulhrsw ym19, ym9
+ pmulhrsw ym20, ym9
+ vpacksswb m21{k3}, m19, m20
+%else
+ punpcklbw m19, m20, m21
+ punpckhbw m20, m21
+ pmaddubsw m19, m13, m19
+ pmaddubsw m20, m13, m20
+ pmulhrsw m19, m9
+ pmulhrsw m20, m9
+ packsswb m21, m19, m20
+%endif
+%%add_noise:
+ punpcklbw m20, m5, m21
+ punpckhbw m21, m5
+%%add_noise_h:
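+ ; fetch luma for the scaling lookup: vertically subsampled layouts use
+ ; every other luma row (lstride << ss_ver), and for horizontal subsampling
+ ; the pb_even/pb_odd permutes below split even/odd columns, which pavgb
+ ; then averages down to chroma width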
+ mova ym18, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1
+%if %2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova ym16, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m16, [lumaq+lstrideq*(1<<%3)], 1
+ mova xm17, [srcq+strideq*0]
+ mova m19, m11
+ vpermi2b m19, m18, m16
+ vinserti128 ym17, [srcq+strideq*1], 1
+ vpermt2b m18, m12, m16
+ vinserti32x4 m17, [srcq+strideq*2], 2
+ pavgb m18, m19
+ vinserti32x4 m17, [srcq+stride3q ], 3
+%else
+ mova ym17, [srcq+strideq*0]
+ vinserti32x8 m17, [srcq+strideq*1], 1
+%endif
+%if %1
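+ ; non-csfl path: index the scaling LUT with
+ ; clip(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + uv_offset); the csfl
+ ; expansion instead jumps straight to .add_noise_main (a label shared
+ ; between both macro expansions) and indexes the LUT with the downsampled
+ ; luma itself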
+ punpckhbw m19, m18, m17
+ punpcklbw m18, m17 ; { luma, chroma }
+ pmaddubsw m19, m14
+ pmaddubsw m18, m14
+ psraw m19, 6
+ psraw m18, 6
+ paddw m19, m15
+ paddw m18, m15
+ packuswb m18, m19
+.add_noise_main:
+ mova m19, m0
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpmovb2m k2, m18
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2<<%2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ lea srcq, [srcq+strideq*(2<<%2)]
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ punpcklbw m16, m17, m5 ; chroma
+ punpckhbw m17, m5
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+%if %2
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+ lea dstq, [dstq+strideq*(2<<%2)]
+ ret
+%else
+ jmp .add_noise_main
+%endif
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/filmgrain_common.asm b/third_party/dav1d/src/x86/filmgrain_common.asm
new file mode 100644
index 0000000000..74f7044e66
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_common.asm
@@ -0,0 +1,46 @@
+; Copyright © 2019-2022, VideoLAN and dav1d authors
+; Copyright © 2019-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+struc FGData
+ .seed: resd 1
+ .num_y_points: resd 1
+ .y_points: resb 14 * 2
+ .chroma_scaling_from_luma: resd 1
+ .num_uv_points: resd 2
+ .uv_points: resb 2 * 10 * 2
+ .scaling_shift: resd 1
+ .ar_coeff_lag: resd 1
+ .ar_coeffs_y: resb 24
+ .ar_coeffs_uv: resb 2 * 28 ; includes padding
+ .ar_coeff_shift: resq 1
+ .grain_scale_shift: resd 1
+ .uv_mult: resd 2
+ .uv_luma_mult: resd 2
+ .uv_offset: resd 2
+ .overlap_flag: resd 1
+ .clip_to_restricted_range: resd 1
+endstruc
+
+cextern gaussian_sequence
diff --git a/third_party/dav1d/src/x86/filmgrain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm
new file mode 100644
index 0000000000..0172f98760
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_sse.asm
@@ -0,0 +1,3233 @@
+; Copyright © 2019-2021, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+SECTION_RODATA
+
+pw_1024: times 8 dw 1024
+pb_27_17_17_27: db 27, 17, 17, 27
+ times 6 db 0, 32
+pb_23_22_h: db 23, 22
+ times 7 db 0, 32
+pb_27_17: times 8 db 27, 17
+pb_17_27: times 8 db 17, 27
+pb_23_22: times 8 db 23, 22
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512
+max: dw 255, 240, 235
+min: dw 0, 16
+pw_1: dw 1
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_8bpc_%2_table %%table
+ %xdefine %%base %1_8bpc_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .ar%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
+
+SECTION .text
+
+%if ARCH_X86_32
+%define PIC_ptr(a) base+a
+%else
+%define PIC_ptr(a) a
+%endif
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
+ LEA r4, $$
+%define base r4-$$
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r2d, [fg_dataq+FGData.grain_scale_shift]
+ movd m2, [base+round+r2*2]
+ movd m0, [fg_dataq+FGData.seed]
+ mova m5, [base+pb_mask]
+ pshuflw m2, m2, q0000
+ pshuflw m0, m0, q0000
+ mov r2, -73*82
+ sub bufq, r2
+ lea r3, [base+gaussian_sequence]
+.loop:
+ pand m6, m0, m1
+ psrlw m3, m6, 10
+ por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m6, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m6 ; set 15th bit for next 4 seeds
+ psllq m6, m3, 30
+ por m3, m6
+ psllq m6, m3, 15
+ por m3, m6 ; aggregate each bit into next seed's high bit
+ pmulhuw m6, m0, m7
+ por m3, m6 ; 4 next output seeds
+ pshuflw m0, m3, q3333
+ psrlw m3, 5
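+ ; the four seeds, shifted down to 11 bits, index gaussian_sequence below;
+ ; pmulhrsw with the round constant then applies the 8bpc grain rounding
+ ; (round2 by 4 + grain_scale_shift)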
+%if ARCH_X86_64
+ movq r6, m3
+ mov r8, r6
+ movzx r5d, r6w
+ shr r6d, 16
+ shr r8, 32
+ movzx r7, r8w
+ shr r8, 16
+
+ movd m6, [r3+r5*2]
+ pinsrw m6, [r3+r6*2], 1
+ pinsrw m6, [r3+r7*2], 2
+ pinsrw m6, [r3+r8*2], 3
+%else
+ movd r6, m3
+ pshuflw m3, m3, q3232
+ movzx r5, r6w
+ shr r6, 16
+
+ movd m6, [r3+r5*2]
+ pinsrw m6, [r3+r6*2], 1
+
+ movd r6, m3
+ movzx r5, r6w
+ shr r6, 16
+
+ pinsrw m6, [r3+r5*2], 2
+ pinsrw m6, [r3+r6*2], 3
+%endif
+ pmulhrsw m6, m2
+ packsswb m6, m6
+ movd [bufq+r2], m6
+ add r2, 4
+ jl .loop
+
+ ; auto-regression code
+ movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
+ lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
+ jmp r2
+
+.ar1:
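+ ; lag-1 AR filter: each grain value becomes
+ ; clamp(g + ((topleft*cf0 + top*cf1 + topright*cf2 + left*cf3 + rnd) >> shift)),
+ ; with the left neighbour carried in val3d across the inner loop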
+%if ARCH_X86_32
+ DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
+%elif WIN64
+ DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
+ mov bufq, r0
+%else
+ DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+%endif
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_y]
+ mov ecx, [fg_dataq+FGData.ar_coeff_shift]
+%if ARCH_X86_32
+ mov r1m, cf3d
+ DEFINE_ARGS buf, shift, val3, min, max, x, val0
+%define hd r0mp
+%define cf3d r1mp
+%elif WIN64
+ DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
+%else
+ DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+%endif
+ pxor m6, m6
+ pcmpgtb m7, m6, m4
+ punpcklbw m4, m7
+ pinsrw m4, [base+pw_1], 3
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+ pshuflw m3, m3, q0000
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+ movq m0, [bufq+xq-82-1] ; top/left
+ pcmpgtb m7, m6, m0
+ punpcklbw m0, m7
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar1
+.ar0:
+ RET
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*8
+%endif
+ DEFINE_ARGS buf, fg_data, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m6, [base+round_vals-12+shiftq*2]
+ movd m7, [base+byte_blend+1]
+ SCRATCH 7, 15, 7
+ movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
+ movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
+ pxor m7, m7
+ pshuflw m6, m6, q0000
+ punpcklwd m6, m7
+ pcmpgtb m4, m7, m0
+ pcmpgtb m5, m7, m1
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ DEFINE_ARGS buf, fg_data, h, x
+ pshufd m4, m1, q0000
+ pshufd m5, m1, q1111
+ pshufd m3, m0, q3333
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+ SCRATCH 6, 14, 6
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+
+.x_loop_ar2:
+ movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pcmpgtb m2, m7, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
+ psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
+ punpcklwd m2, m0, m5
+ punpcklwd m3, m4
+ pmaddwd m2, m8
+ pmaddwd m3, m11
+ paddd m2, m3
+
+ psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
+ psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m6, m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m4, m5
+ punpcklwd m6, m1
+ psrldq m5, m1, 6 ; y=-1,x=[+1,+5]
+ psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m5, m1
+ pmaddwd m4, m9
+ pmaddwd m6, m10
+ pmaddwd m5, m12
+ paddd m4, m6
+ paddd m2, m5
+ paddd m2, m4
+ paddd m2, m14
+
+ movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+ pcmpgtb m4, m7, m0
+ punpcklbw m1, m0, m4
+ pmaddwd m3, m1, m13
+ paddd m3, m2
+ psrldq m1, 4 ; y=0,x=0
+ psrldq m2, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ paddw m3, m1
+ packsswb m3, m3
+ pslldq m3, 2
+ pand m3, m15
+ pandn m1, m15, m0
+ por m0, m1, m3
+ psrldq m0, 1
+ ; overwrite 2 pixels, but that's ok
+ movd [bufq+xq-1], m0
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+ DEFINE_ARGS buf, fg_data, shift
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*14
+%elif WIN64
+ SUB rsp, 16*6
+%assign stack_size_padded (stack_size_padded+16*6)
+%assign stack_size (stack_size+16*6)
+%else
+ ALLOC_STACK -16*6
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m6, [base+round_vals-12+shiftq*2]
+ movd m7, [base+byte_blend]
+ movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pxor m3, m3
+ pcmpgtb m4, m3, m0
+ pcmpgtb m3, m2
+ pshuflw m6, m6, q0000
+ SCRATCH 6, 14, 12
+ SCRATCH 7, 15, 13
+ punpckhbw m1, m0, m4
+ punpcklbw m0, m4
+ punpcklbw m2, m3
+ pshufd m3, m0, q1111
+ pshufd m4, m0, q2222
+ pshufd m5, m0, q3333
+ pshufd m0, m0, q0000
+ mova [rsp+ 0*16], m0
+ mova [rsp+ 1*16], m3
+ mova [rsp+ 2*16], m4
+ mova [rsp+ 3*16], m5
+ pshufd m6, m1, q1111
+ pshufd m7, m1, q2222
+ pshufd m5, m1, q3333
+ pshufd m1, m1, q0000
+ pshufd m3, m2, q1111
+ psrldq m0, m2, 10
+ pinsrw m2, [base+pw_1], 5
+ pshufd m4, m2, q2222
+ pshufd m2, m2, q0000
+ pinsrw m0, [base+round_vals+shiftq*2-10], 3
+ mova [rsp+ 4*16], m1
+ mova [rsp+ 5*16], m6
+ SCRATCH 7, 8, 6
+ SCRATCH 5, 9, 7
+ SCRATCH 2, 10, 8
+ SCRATCH 3, 11, 9
+ SCRATCH 4, 12, 10
+ SCRATCH 0, 13, 11
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+
+.x_loop_ar3:
+ movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ pxor m3, m3
+ pcmpgtb m3, m0
+ punpckhbw m2, m0, m3
+ punpcklbw m0, m3
+
+ psrldq m5, m0, 2
+ psrldq m6, m0, 4
+ psrldq m7, m0, 6
+ punpcklwd m4, m0, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 0*16]
+ pmaddwd m6, [rsp+ 1*16]
+ paddd m4, m6
+
+ movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ pxor m5, m5
+ pcmpgtb m5, m1
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ palignr m6, m2, m0, 10
+ palignr m7, m2, m0, 12
+ psrldq m0, 8
+ punpcklwd m0, m6
+ punpcklwd m7, m1
+ pmaddwd m0, [rsp+ 2*16]
+ pmaddwd m7, [rsp+ 3*16]
+ paddd m0, m7
+ paddd m0, m4
+
+ psrldq m4, m1, 2
+ psrldq m5, m1, 4
+ psrldq m6, m1, 6
+ psrldq m7, m1, 8
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 4*16]
+ pmaddwd m6, [rsp+ 5*16]
+ paddd m4, m6
+ paddd m0, m4
+
+ movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ pxor m7, m7
+ pcmpgtb m7, m2
+ punpckhbw m5, m2, m7
+ punpcklbw m2, m7
+ palignr m7, m3, m1, 10
+ palignr m3, m1, 12
+ psrldq m1, m2, 2
+ punpcklwd m7, m3
+ punpcklwd m3, m2, m1
+ pmaddwd m7, m8
+ pmaddwd m3, m9
+ paddd m7, m3
+ paddd m0, m7
+
+ psrldq m6, m2, 4
+ psrldq m1, m2, 6
+ psrldq m3, m2, 8
+ palignr m4, m5, m2, 10
+ palignr m5, m5, m2, 12
+
+ punpcklwd m6, m1
+ punpcklwd m3, m4
+ punpcklwd m5, m14
+ pmaddwd m6, m10
+ pmaddwd m3, m11
+ pmaddwd m5, m12
+ paddd m0, m6
+ paddd m3, m5
+ paddd m0, m3
+
+ movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pxor m5, m5
+ pcmpgtb m5, m1
+ punpcklbw m2, m1, m5
+ pmaddwd m2, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ packsswb m2, m2
+ pslldq m2, 3
+ pand m2, m15
+ pandn m3, m15, m1
+ por m1, m2, m3
+ movd [bufq+xq-3], m1
+ psrldq m1, 1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
+INIT_XMM ssse3
+cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
+ movifnidn r2, r2mp
+ movifnidn r3, r3mp
+ LEA r4, $$
+%define base r4-$$
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ movd m6, [base+round+r5*2]
+ mova m5, [base+pb_mask]
+ movd m0, [fg_dataq+FGData.seed]
+ movd m2, [base+pw_seed_xor+uvq*4]
+ pxor m0, m2
+ pshuflw m6, m6, q0000
+ pshuflw m0, m0, q0000
+ lea r6, [base+gaussian_sequence]
+%if %2
+%if ARCH_X86_64
+ mov r7d, 73-35*%3
+%else
+ mov r3mp, 73-35*%3
+%endif
+ add bufq, 44
+.loop_y:
+ mov r5, -44
+.loop_x:
+%else
+ mov r5, -82*73
+ sub bufq, r5
+.loop:
+%endif
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m3, m2
+ psllq m2, m3, 15
+ por m3, m2 ; aggregate each bit into next seed's high bit
+ pmulhuw m2, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ movd r9d, m2
+ pshuflw m2, m2, q3232
+ movzx r8, r9w
+ shr r9, 16
+
+ movd m3, [r6+r8*2]
+ pinsrw m3, [r6+r9*2], 1
+
+ movd r9d, m2
+ movzx r8, r9w
+ shr r9, 16
+
+ pinsrw m3, [r6+r8*2], 2
+ pinsrw m3, [r6+r9*2], 3
+%else
+ movd r2, m2
+ pshuflw m2, m2, q3232
+ movzx r1, r2w
+ shr r2, 16
+
+ movd m3, [r6+r1*2]
+ pinsrw m3, [r6+r2*2], 1
+
+ movd r2, m2
+ movzx r1, r2w
+ shr r2, 16
+
+ pinsrw m3, [r6+r1*2], 2
+ pinsrw m3, [r6+r2*2], 3
+%endif
+ pmulhrsw m3, m6
+ packsswb m3, m3
+ movd [bufq+r5], m3
+ add r5, 4
+%if %2
+ jl .loop_x
+ add bufq, 82
+%if ARCH_X86_64
+ dec r7d
+%else
+ dec r3mp
+%endif
+ jg .loop_y
+%else
+ jl .loop
+%endif
+
+%if ARCH_X86_32
+ mov r2, r2mp
+%endif
+
+ ; auto-regression code
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
+ lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
+ jmp r5
+
+.ar0:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -2*16
+%endif
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ movd m4, [base+hmul_bits+shiftq*2]
+ DEFINE_ARGS buf, bufy, h, x
+ pxor m0, m0
+ pcmpgtb m0, m5
+ punpcklbw m5, m0
+ movd m7, [base+pb_1]
+%if %2
+ movd m6, [base+hmul_bits+2+%3*2]
+%endif
+ pshuflw m5, m5, q0000
+ pshuflw m4, m4, q0000
+ pshufd m7, m7, q0000
+%if %2
+ pshuflw m6, m6, q0000
+%endif
+ punpcklqdq m5, m5
+ punpcklqdq m4, m4
+%if %2
+ punpcklqdq m6, m6
+%endif
+ pcmpeqw m1, m1
+ pslldq m1, 12>>%2
+ SCRATCH 1, 8, 0
+ SCRATCH 4, 9, 1
+%if %2
+ sub bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+ sub bufq, 82*70-3
+%endif
+ add bufyq, 3+82*3
+ mov hd, 70-35*%3
+.y_loop_ar0:
+ xor xd, xd
+.x_loop_ar0:
+ ; first 32 pixels
+%if %2
+ movu m1, [bufyq+xq*2]
+%if %3
+ movu m2, [bufyq+xq*2+82]
+%endif
+ movu m3, [bufyq+xq*2+16]
+%if %3
+ movu m4, [bufyq+xq*2+82+16]
+%endif
+ pmaddubsw m0, m7, m1
+%if %3
+ pmaddubsw m1, m7, m2
+%endif
+ pmaddubsw m2, m7, m3
+%if %3
+ pmaddubsw m3, m7, m4
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ pmulhrsw m0, m6
+ pmulhrsw m2, m6
+%else
+ movu m0, [bufyq+xq]
+ pxor m6, m6
+ pcmpgtb m6, m0
+ punpckhbw m2, m0, m6
+ punpcklbw m0, m6
+%endif
+ pmullw m0, m5
+ pmullw m2, m5
+ pmulhrsw m0, m9
+ pmulhrsw m2, m9
+ movu m1, [bufq+xq]
+ pxor m4, m4
+ pcmpgtb m4, m1
+ punpckhbw m3, m1, m4
+%if %2
+ punpcklbw m1, m4
+ paddw m2, m3
+ paddw m0, m1
+%else
+ punpcklbw m6, m1, m4
+ paddw m2, m3
+ paddw m0, m6
+%endif
+ packsswb m0, m2
+%if %2
+ movu [bufq+xq], m0
+ add xd, 16
+ cmp xd, 32
+ jl .x_loop_ar0
+
+ ; last 6/12 pixels
+ movu m1, [bufyq+xq*(1+%2)]
+%if %3
+ movu m2, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m0, m7, m1
+%if %3
+ pmaddubsw m1, m7, m2
+ paddw m0, m1
+%endif
+ pmulhrsw m0, m6
+ pmullw m0, m5
+ pmulhrsw m0, m9
+ movq m1, [bufq+xq]
+ pxor m4, m4
+ pcmpgtb m4, m1
+ punpcklbw m2, m1, m4
+ paddw m0, m2
+ packsswb m0, m0
+ pandn m2, m8, m0
+ pand m1, m8
+ por m2, m1
+ movq [bufq+xq], m2
+%else
+ add xd, 16
+ cmp xd, 80
+ je .y_loop_final_ar0
+ movu [bufq+xq-16], m0
+ jmp .x_loop_ar0
+.y_loop_final_ar0:
+ pandn m2, m8, m0
+ pand m1, m8
+ por m2, m1
+ movu [bufq+xq-16], m2
+%endif
+
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar0
+ RET
+
+.ar1:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
+ imul uvd, 28
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
+ pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
+%if ARCH_X86_32
+ mov r3mp, cf3d
+ DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
+%elif WIN64
+ DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
+ mov bufq, r0
+%else
+ DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+%if %2
+ movd m7, [base+pb_1]
+ movd m6, [base+hmul_bits+2+%3*2]
+%endif
+ psrldq m4, 1
+%if ARCH_X86_32
+ DEFINE_ARGS buf, shift, val0, val3, min, max, x
+%elif WIN64
+ DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
+%else
+ DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
+%endif
+ pxor m5, m5
+ punpcklwd m3, m5
+%if %2
+ punpcklwd m6, m6
+%endif
+ pcmpgtb m5, m4
+ punpcklbw m4, m5
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ pshufd m3, m3, q0000
+%if %2
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*69+3
+%endif
+%if ARCH_X86_32
+ add r1mp, 79+82*3
+ mov r0mp, 70-35*%3
+%else
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+%endif
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+%if %2
+%if ARCH_X86_32
+ mov r2, r1mp
+ movq m0, [r2+xq*2]
+%if %3
+ movq m1, [r2+xq*2+82]
+%endif
+%else
+ movq m0, [bufyq+xq*2]
+%if %3
+ movq m1, [bufyq+xq*2+82]
+%endif
+%endif
+ pmaddubsw m2, m7, m0
+%if %3
+ pmaddubsw m0, m7, m1
+ paddw m2, m0
+%endif
+ pmulhrsw m2, m6
+%else
+%if ARCH_X86_32
+ mov r2, r1mp
+ movd m2, [r2+xq]
+%else
+ movd m2, [bufyq+xq]
+%endif
+ pxor m0, m0
+ pcmpgtb m0, m2
+ punpcklbw m2, m0
+%endif
+
+ movq m0, [bufq+xq-82-1] ; top/left
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m1, m2
+ psrldq m2, m0, 2 ; top
+ punpcklwd m0, m2
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+ paddd m0, m3
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+%if ARCH_X86_32
+ imul val3d, r3mp
+%else
+ imul val3d, cf3d
+%endif
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82
+%if ARCH_X86_32
+ add r1mp, 82<<%3
+ dec r0mp
+%else
+ add bufyq, 82<<%3
+ dec hd
+%endif
+ jg .y_loop_ar1
+ RET
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+ ALLOC_STACK -8*16
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ movd m7, [base+round_vals-12+shiftq*2]
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12
+ pxor m2, m2
+ pcmpgtb m2, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ pinsrw m1, [base+pw_1], 5
+ punpcklwd m7, m7
+ pshufd m7, m7, q0000
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+ pshufd m4, m1, q0000
+ pshufd m5, m1, q1111
+ pshufd m6, m1, q2222
+ pshufd m3, m0, q3333
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+ SCRATCH 6, 14, 6
+ SCRATCH 7, 15, 7
+%if %2
+ movd m7, [base+hmul_bits+2+%3*2]
+ movd m6, [base+pb_1]
+ punpcklwd m7, m7
+ pshufd m6, m6, q0000
+ pshufd m7, m7, q0000
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*69+3
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ pxor m2, m2
+ movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pcmpgtb m2, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
+ psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
+ punpcklwd m2, m0, m5
+ punpcklwd m3, m4
+ pmaddwd m2, m8
+ pmaddwd m3, m11
+ paddd m2, m3
+
+ psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
+ psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m4, m5
+ punpcklwd m0, m1
+ psrldq m3, m1, 6 ; y=-1,x=[+1,+5]
+ psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m3, m1
+ pmaddwd m4, m9
+ pmaddwd m0, m10
+ pmaddwd m3, m12
+ paddd m4, m0
+ paddd m2, m3
+ paddd m2, m4
+
+%if %2
+ movq m1, [bufyq+xq*2]
+%if %3
+ movq m3, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m0, m6, m1
+%if %3
+ pmaddubsw m1, m6, m3
+ paddw m0, m1
+%endif
+ pmulhrsw m0, m7
+%else
+ movd m0, [bufyq+xq]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+%endif
+ punpcklwd m0, m15
+ pmaddwd m0, m14
+ paddd m2, m0
+
+ movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
+ pxor m4, m4
+ movd m5, [base+byte_blend+1]
+ punpcklbw m5, m5
+.x_loop_ar2_inner:
+ pcmpgtb m1, m4, m0
+ punpcklbw m0, m1
+ pmaddwd m3, m0, m13
+ paddd m3, m2
+ psrldq m2, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ pslldq m3, 4
+ pand m3, m5
+ paddw m0, m3
+ packsswb m0, m0
+ movd [bufq+xq-2], m0
+ psrldq m0, 1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+%if ARCH_X86_32
+ ALLOC_STACK -15*16
+%else
+ SUB rsp, 16*7
+%assign stack_size_padded (stack_size_padded+16*7)
+%assign stack_size (stack_size+16*7)
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
+ pxor m3, m3
+ pcmpgtb m3, m0
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ pshufd m2, m0, q1111
+ pshufd m3, m0, q2222
+ pshufd m4, m0, q3333
+ pshufd m0, m0, q0000
+ pshufd m5, m1, q1111
+ pshufd m6, m1, q2222
+ pshufd m7, m1, q3333
+ pshufd m1, m1, q0000
+ mova [rsp+ 0*16], m0
+ mova [rsp+ 1*16], m2
+ mova [rsp+ 2*16], m3
+ mova [rsp+ 3*16], m4
+ mova [rsp+ 4*16], m1
+ mova [rsp+ 5*16], m5
+ mova [rsp+ 6*16], m6
+ SCRATCH 7, 8, 7
+
+ movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma]
+ pxor m4, m4
+ pcmpgtb m4, m2
+ punpckhbw m5, m2, m4
+ punpcklbw m2, m4
+ pshufd m4, m2, q3232
+ punpcklwd m3, m4, m5
+ pshuflw m5, m4, q3321
+ pshufd m4, m3, q0000
+ pshufd m3, m2, q1111
+ pshufd m2, m2, q0000
+ pinsrw m5, [base+round_vals+shiftq*2-10], 3
+ SCRATCH 2, 9, 8
+ SCRATCH 3, 10, 9
+ SCRATCH 4, 11, 10
+ SCRATCH 5, 12, 11
+
+ movd m2, [base+round_vals-12+shiftq*2]
+%if %2
+ movd m1, [base+pb_1]
+ movd m3, [base+hmul_bits+2+%3*2]
+%endif
+ pxor m0, m0
+ punpcklwd m2, m0
+%if %2
+ punpcklwd m3, m3
+%endif
+ pshufd m2, m2, q0000
+%if %2
+ pshufd m1, m1, q0000
+ pshufd m3, m3, q0000
+ SCRATCH 1, 13, 12
+%endif
+ SCRATCH 2, 14, 13
+%if %2
+ SCRATCH 3, 15, 14
+%endif
+
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*69+3
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+
+.x_loop_ar3:
+ movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ pxor m4, m4
+ pcmpgtb m4, m0
+ punpckhbw m3, m0, m4
+ punpcklbw m0, m4
+
+ psrldq m5, m0, 2
+ psrldq m6, m0, 4
+ psrldq m7, m0, 6
+ punpcklwd m4, m0, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 0*16]
+ pmaddwd m6, [rsp+ 1*16]
+ paddd m4, m6
+
+ palignr m2, m3, m0, 10
+ palignr m3, m0, 12
+ psrldq m0, 8
+
+ movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ pxor m6, m6
+ pcmpgtb m6, m1
+ punpckhbw m5, m1, m6
+ punpcklbw m1, m6
+
+ punpcklwd m0, m2
+ punpcklwd m3, m1
+ pmaddwd m0, [rsp+ 2*16]
+ pmaddwd m3, [rsp+ 3*16]
+ paddd m0, m3
+ paddd m0, m4
+
+ movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ pxor m7, m7
+ pcmpgtb m7, m2
+ punpckhbw m6, m2, m7
+ punpcklbw m2, m7
+
+ palignr m3, m5, m1, 10
+ palignr m5, m1, 12
+ psrldq m4, m2, 2
+
+ punpcklwd m3, m5
+ punpcklwd m5, m2, m4
+ pmaddwd m3, [rsp+ 6*16]
+ pmaddwd m5, m8
+ paddd m3, m5
+ paddd m0, m3
+
+ psrldq m3, m1, 2
+ psrldq m4, m1, 4
+ psrldq m5, m1, 6
+ psrldq m1, 8
+
+ punpcklwd m3, m4
+ punpcklwd m5, m1
+ pmaddwd m3, [rsp+ 4*16]
+ pmaddwd m5, [rsp+ 5*16]
+ paddd m3, m5
+ paddd m0, m3
+
+%if %2
+ movq m1, [bufyq+xq*2]
+%if %3
+ movq m3, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m7, m13, m1
+%if %3
+ pmaddubsw m5, m13, m3
+ paddw m7, m5
+%endif
+ pmulhrsw m7, m15
+%else
+ movd m7, [bufyq+xq]
+ pxor m1, m1
+ pcmpgtb m1, m7
+ punpcklbw m7, m1
+%endif
+
+ psrldq m1, m2, 4
+ psrldq m3, m2, 6
+ palignr m4, m6, m2, 10
+ palignr m6, m2, 12
+ psrldq m2, 8
+
+ punpcklwd m1, m3
+ punpcklwd m2, m4
+ punpcklwd m6, m7
+ pmaddwd m1, m9
+ pmaddwd m2, m10
+ pmaddwd m6, m11
+ paddd m1, m2
+ paddd m0, m6
+ paddd m0, m1
+ paddd m0, m14
+
+ movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ pxor m4, m4
+ movd m5, [base+byte_blend]
+.x_loop_ar3_inner:
+ pcmpgtb m2, m4, m1
+ punpcklbw m3, m1, m2
+ pmaddwd m2, m3, m12
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw, we only care about one value
+ packsswb m2, m2
+ pandn m3, m5, m1
+ pslld m2, 24
+ pand m2, m5
+ por m1, m2, m3
+ movd [bufq+xq-3], m1
+ psrldq m1, 1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar3
+ RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
+
+%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
+%assign %%idx 0
+%define %%tmp %2
+%if %0 == 6
+%define %%tmp %6
+%endif
+%rep 4
+%if %%idx == 0
+ movd %5 %+ d, %2
+ pshuflw %%tmp, %2, q3232
+%else
+ movd %5 %+ d, %%tmp
+%if %%idx == 2
+ punpckhqdq %%tmp, %%tmp
+%elif %%idx == 4
+ psrlq %%tmp, 32
+%endif
+%endif
+ movzx %4 %+ d, %5 %+ w
+ shr %5 %+ d, 16
+
+%if %%idx == 0
+ movd %1, [%3+%4]
+%else
+ pinsrw %1, [%3+%4], %%idx + 0
+%endif
+ pinsrw %1, [%3+%5], %%idx + 1
+%assign %%idx %%idx+2
+%endrep
+%endmacro
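+; the macro above emulates an 8-lane word gather: dst.word[i] is loaded from
+; base + src.word[i] via movd/pinsrw; callers pass scalingq-1 and shift the
+; result right by 8 so that scaling[src] ends up in the low byte of each word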
+
+INIT_XMM ssse3
+; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+ ; copy stack arguments to new position post-alignment, so that we
+ ; don't have to keep the old stack location in a separate register
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov r5, r8m
+
+ mov [rsp+5*mmsize+ 4*gprsize], r0
+ mov [rsp+5*mmsize+ 6*gprsize], r1
+ mov [rsp+5*mmsize+ 8*gprsize], r2
+ mov [rsp+5*mmsize+10*gprsize], r3
+ mov [rsp+5*mmsize+11*gprsize], r4
+ mov [rsp+5*mmsize+12*gprsize], r5
+%else
+cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+%endif
+ mov srcq, srcm
+ mov fg_dataq, r3m
+ mov scalingq, r5m
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+5*mmsize+ 4*gprsize]
+%define r1m [rsp+5*mmsize+ 5*gprsize]
+%define r2m [rsp+5*mmsize+ 6*gprsize]
+%define r3m [rsp+5*mmsize+ 7*gprsize]
+%define r4m [rsp+5*mmsize+ 8*gprsize]
+%define r5m [rsp+5*mmsize+ 9*gprsize]
+%define r6m [rsp+5*mmsize+10*gprsize]
+%define r7m [rsp+5*mmsize+11*gprsize]
+%define r8m [rsp+5*mmsize+12*gprsize]
+%endif
+ LEA r5, pb_mask
+%define base r5-pb_mask
+ mov r5m, picptrq
+%else
+cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+ lea r7, [pb_mask]
+%define base r7-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ movd m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ movd m4, [base+max+r6*4]
+ movd m5, [base+min+r6*2]
+ punpcklwd m3, m3
+ punpcklwd m4, m4
+ punpcklwd m5, m5
+ pshufd m3, m3, q0000
+ pshufd m4, m4, q0000
+ pshufd m5, m5, q0000
+ SCRATCH 3, 11, 0
+ SCRATCH 4, 12, 1
+ SCRATCH 5, 13, 2
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+ mov sbyd, r8m
+ mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+ test overlapd, overlapd
+ jz .no_vertical_overlap
+ mova m6, [base+pw_1024]
+ mova m7, [base+pb_27_17_17_27]
+ SCRATCH 6, 14, 3
+ SCRATCH 7, 15, 4
+ test sbyd, sbyd
+ jnz .vertical_overlap
+ ; fall-through
+
+.no_vertical_overlap:
+ mov r8m, overlapd
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ unused1, unused2, see, unused3
+%endif
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r4m, wq
+ DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+%endif
+
+.loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, unused
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+ ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, unused
+%endif
+
+.loop_x_odd:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, m3
+ vpgatherdw m5, m1, scalingq-1, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq-1, r12, r13, m3
+ vpgatherdw m5, m1, scalingq-1, r12, r13, m3
+%endif
+ REPX {psrlw x, 8}, m4, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+ pcmpgtb m7, m2, m3
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m4
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ add srcq, r4mp
+%else
+ lea srcq, [src_bakq+wq]
+%endif
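+ ; bit 2 of r8m tracks which 16-pixel half of the 32x32 block is being
+ ; processed; bits 0 and 1 hold the left/top overlap flags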
+ btc dword r8m, 2
+ jc .next_blk
+
+ add offxyd, 16
+ test dword r8m, 2 ; r8m & 2 = have_top_overlap
+ jz .loop_x_odd
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxyd
+%endif
+ jnz .loop_x_odd_v_overlap
+
+.next_blk:
+ test dword r8m, 1
+ jz .loop_x
+
+ test dword r8m, 2
+ jnz .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+%if ARCH_X86_32
+ ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+ ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+ DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
+
+ add offxyd, 16 ; left_offxyd
+ mov [rsp+5*mmsize+0*gprsize], offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+
+ mov seed, r3m
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, m3
+ vpgatherdw m5, m1, scalingq-1, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq-1, r12, r13, m3
+ vpgatherdw m5, m1, scalingq-1, r12, r13, m3
+%endif
+ REPX {psrlw x, 8}, m4, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+0*gprsize]
+ movd m7, [grain_lutq+r5]
+%else
+ movd m7, [grain_lutq+left_offxyq]
+%endif
+ punpcklbw m7, m3
+ pmaddubsw m6, m15, m7
+ pmulhrsw m6, m14
+ packsswb m6, m6
+ shufps m6, m3, q3210
+ pcmpgtb m2, m6
+ punpcklbw m7, m6, m2
+ punpckhbw m6, m2
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m7, m4
+ pmullw m6, m5
+ pmulhrsw m7, m11
+ pmulhrsw m6, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m7
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y_h_overlap
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r1m
+ add srcq, r4m
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ xor dword r8m, 4
+ add offxyd, 16
+
+ ; since this half-block had left-overlap, the next does not
+ test dword r8m, 2 ; have_top_overlap
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxyd
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end:
+ RET
+
+.vertical_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+ or overlapd, 2 ; top_overlap: overlap & 2
+ mov r8m, overlapd
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul tmpd, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add tmpd, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and tmpd, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, tmpd
+%if ARCH_X86_32
+ xor sbyd, seed ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ tmp, unused2, see, unused3
+%endif
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r4m, wq
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%endif
+
+.loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ ; we assume from the block above that bits 8-15 of tmpd are zeroed,
+ ; because of the 'and tmpd, 0x00ff00ff' above
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, unused, top_offxy
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, unused, top_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+5*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+.loop_x_odd_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ lea r5, [base+pb_27_17]
+ mov [rsp+5*mmsize+12], r5
+%else
+ mova m8, [pb_27_17]
+%endif
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, m3
+ vpgatherdw m5, m1, scalingq-1, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq-1, r12, r13, m3
+ vpgatherdw m5, m1, scalingq-1, r12, r13, m3
+%endif
+ REPX {psrlw x, 8}, m4, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+1*gprsize]
+ movu m7, [grain_lutq+r5]
+%else
+ movu m7, [grain_lutq+top_offxyq]
+%endif
+ punpckhbw m6, m7, m3
+ punpcklbw m7, m3
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+12]
+ pmaddubsw m3, [r5], m6
+ pmaddubsw m6, [r5], m7
+%else
+ pmaddubsw m3, m8, m6
+ pmaddubsw m6, m8, m7
+%endif
+ pmulhrsw m3, m14
+ pmulhrsw m6, m14
+ packsswb m6, m3
+ pcmpgtb m7, m2, m6
+ punpcklbw m2, m6, m7
+ punpckhbw m6, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m4
+ pmullw m6, m5
+ pmulhrsw m2, m11
+ pmulhrsw m6, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+12], mmsize
+%else
+ mova m8, [pb_17_27]
+%endif
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hw
+ jz .end_y_v_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; the remaining (up to) 30 lines
+ btc hd, 16
+ jnc .loop_y_v_overlap
+ jmp .loop_y
+
+.end_y_v_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ add srcq, r4mp
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ btc dword r8m, 2
+ jc .loop_x_hv_overlap
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.loop_x_hv_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ lea r5, [base+pb_27_17]
+ mov [rsp+5*mmsize+12], r5
+
+ DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
+
+ mov r5, [rsp+5*mmsize+1*gprsize]
+ mov r4, offxyd
+ add r5, 16
+ add r4, 16
+ mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy
+ mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy
+
+ DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
+
+ xor tmpd, tmpd
+ mov seed, r3m
+%else
+ mova m8, [pb_27_17]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ tmp, unused2, see, unused3
+
+ ; we assume from the block above that bits 8-15 of tmpd are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+
+ movzx r5, offxyw ; top_offxy
+ mov [rsp+5*mmsize+1*gprsize], r5
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy
+
+ movzx top_offxyd, offxyw
+%endif
+ shr offxyd, 16
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy
+ mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy
+ movu m6, [grain_lutq+r5]
+ mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy
+ movd m4, [grain_lutq+r0]
+ movd m7, [grain_lutq+r5]
+%else
+ movu m6, [grain_lutq+top_offxyq]
+ movd m4, [grain_lutq+left_offxyq]
+ movd m7, [grain_lutq+topleft_offxyq]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m4, m3
+ punpcklbw m7, m6
+ pmaddubsw m2, m15, m4
+ pmaddubsw m4, m15, m7
+ pmulhrsw m2, m14
+ pmulhrsw m4, m14
+ packsswb m2, m2
+ packsswb m4, m4
+ shufps m2, m3, q3210
+ shufps m4, m6, q3210
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m3, m4, m2
+ punpckhbw m4, m2
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+12]
+ pmaddubsw m7, [r5], m4
+ pmaddubsw m4, [r5], m3
+%else
+ pmaddubsw m7, m8, m4
+ pmaddubsw m4, m8, m3
+%endif
+ pmulhrsw m7, m14
+ pmulhrsw m4, m14
+ packsswb m4, m7
+ pxor m2, m2
+ pcmpgtb m7, m2, m4
+ punpcklbw m3, m4, m7
+ punpckhbw m4, m7
+
+ ; src
+ mova m0, [srcq]
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m5, m0, scalingq-1, r0, r5, m7
+ vpgatherdw m6, m1, scalingq-1, r0, r5, m7
+%else
+ vpgatherdw m5, m0, scalingq-1, r13, r14, m7
+ vpgatherdw m6, m1, scalingq-1, r13, r14, m7
+%endif
+ REPX {psrlw x, 8}, m5, m6
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
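+ ; (round2(x, s) = (x + (1 << (s - 1))) >> s; here the rounded shift by
+ ; scaling_shift is performed by pmulhrsw with the precomputed constant in m11)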
+ pmullw m3, m5
+ pmullw m4, m6
+ pmulhrsw m3, m11
+ pmulhrsw m4, m11
+
+ ; dst = clip_pixel(src, noise)
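+ ; (i.e. dst = clamp(src + noise, m13, m12), the packed pixel min/max bounds)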
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+12], mmsize
+%else
+ mova m8, [pb_17_27]
+%endif
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hw
+ jz .end_y_hv_overlap
+ ; the first 2 lines get vertical overlap, then we fall back to the
+ ; non-overlap code for the remaining (up to 30) lines
+ btc hd, 16
+ jnc .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r1m
+ add srcq, r4m
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ xor dword r8m, 4
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end_hv:
+ RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+INIT_XMM ssse3
+%if ARCH_X86_32
+; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
+; sby, luma, lstride, uv_pl, is_id)
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
+cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov [rsp+7*mmsize+3*gprsize], r0
+ mov [rsp+7*mmsize+5*gprsize], r1
+ mov [rsp+7*mmsize+7*gprsize], r2
+ mov [rsp+7*mmsize+9*gprsize], r3
+ mov [rsp+7*mmsize+10*gprsize], r4
+
+ mov r0, r8m
+ mov r1, r9m
+ mov r2, r10m
+ mov r4, r11m
+ mov r3, r12m
+ mov [rsp+7*mmsize+11*gprsize], r0
+ mov [rsp+7*mmsize+12*gprsize], r1
+ mov [rsp+7*mmsize+13*gprsize], r2
+ mov [rsp+7*mmsize+14*gprsize], r4
+%else
+cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
+ tmp, src, scaling, h, fg_data, picptr, unused
+%endif
+ mov srcq, srcm
+ mov fg_dataq, r3m
+ mov scalingq, r5m
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+7*mmsize+ 3*gprsize]
+%define r1m [rsp+7*mmsize+ 4*gprsize]
+%define r2m [rsp+7*mmsize+ 5*gprsize]
+%define r3m [rsp+7*mmsize+ 6*gprsize]
+%define r4m [rsp+7*mmsize+ 7*gprsize]
+%define r5m [rsp+7*mmsize+ 8*gprsize]
+%define r6m [rsp+7*mmsize+ 9*gprsize]
+%define r7m [rsp+7*mmsize+10*gprsize]
+%define r8m [rsp+7*mmsize+11*gprsize]
+%define r9m [rsp+7*mmsize+12*gprsize]
+%define r10m [rsp+7*mmsize+13*gprsize]
+%define r11m [rsp+7*mmsize+14*gprsize]
+%define r12m [rsp+7*mmsize+15*gprsize]
+%endif
+ LEA r5, pb_mask
+%define base r5-pb_mask
+ mov r5m, r5
+%else
+cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
+ lea r8, [pb_mask]
+%define base r8-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ movd m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ lea tmpd, [r6d*2]
+%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
+ test r3, r3
+%else
+ cmp dword r12m, 0 ; is_idm
+%endif
+ movd m5, [base+min+r6*2]
+ cmovne r6d, tmpd
+ movd m4, [base+max+r6*2]
+ punpcklwd m3, m3
+ punpcklwd m5, m5
+ punpcklwd m4, m4
+ pshufd m3, m3, q0000
+ pshufd m5, m5, q0000
+ pshufd m4, m4, q0000
+ SCRATCH 3, 11, 0
+ SCRATCH 4, 12, 1
+ SCRATCH 5, 13, 2
+
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
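+ ; two copies of the main loop are emitted: the fall-through variant (%1=1)
+ ; maps chroma through uv_luma_mult/uv_mult/uv_offset before the scaling
+ ; lookup, while the .csfl variant (%1=0) indexes scaling[] with luma directly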
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+%if %1
+ mov r6d, dword r11m
+ movd m0, [fg_dataq+FGData.uv_mult+r6*4]
+ movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
+ punpcklbw m6, m1, m0
+ movd m7, [fg_dataq+FGData.uv_offset+r6*4]
+ punpcklwd m6, m6
+ punpcklwd m7, m7
+ pshufd m6, m6, q0000
+ pshufd m7, m7, q0000
+ SCRATCH 6, 14, 3
+ SCRATCH 7, 15, 4
+%endif
+
+ mov sbyd, r8m
+ mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+ test overlapd, overlapd
+ jz %%no_vertical_overlap
+%if ARCH_X86_32
+%if %2
+ mova m1, [base+pb_23_22_h]
+%else
+ mova m1, [base+pb_27_17_17_27]
+%endif
+ mova m0, [base+pw_1024]
+%else
+%if %2
+ mova m1, [pb_23_22_h]
+%else
+ mova m1, [pb_27_17_17_27]
+%endif
+ mova m0, [pw_1024]
+%endif
+ SCRATCH 0, 8, 5
+ SCRATCH 1, 9, 6
+ test sbyd, sbyd
+ jnz %%vertical_overlap
+ ; fall-through
+
+%%no_vertical_overlap:
+ mov r8m, overlapd
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+%define luma_bakq lumaq
+
+ mov wq, r4m
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
+
+ mov lstrideq, r10mp
+%endif
+
+ mov lumaq, r9mp
+ lea src_bakq, [srcq+wq]
+ lea luma_bakq, [lumaq+wq*(1+%2)]
+ neg wq
+ sub r0mp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r11m, luma_bakq
+ mov r4m, wq
+
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+ mov r11mp, src_bakq
+ mov r12mp, strideq
+%endif
+
+%%loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, unused1, unused2, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
+%endif
+
+%%loop_x_odd:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y:
+ ; src
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
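+ ; pmaddubsw with pb_1 sums each pair of adjacent luma bytes and pavgw with
+ ; zero halves that sum with rounding, i.e. a rounded 2:1 horizontal downscale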
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq-1, r12, r2
+ vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq+ 0]
+ pcmpgtb m6, m2, m3
+ punpcklbw m2, m3, m6
+ punpckhbw m3, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; we already incremented lumaq above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
+%endif
+%endif
+ add grain_lutq, 82
+ dec hw
+ jg %%loop_y
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+%if %2 == 0
+ ; adjust top_offxy
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ add offxyd, 16
+ btc dword r8m, 2
+ jc %%loop_x_even
+ test dword r8m, 2
+ jz %%loop_x_odd
+ jmp %%loop_x_odd_v_overlap
+%%loop_x_even:
+%endif
+ test dword r8m, 1
+ jz %%loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+%if ARCH_X86_32
+%if %2
+ lea r6, [offxyd+16]
+ mov [rsp+7*mmsize+0*gprsize], r6
+%else
+ mov [rsp+7*mmsize+0*gprsize], offxyd
+%endif
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+%if %2
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+%else
+ mov left_offxyd, offyd
+%endif
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+ ; src
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq-1, r12, r2
+ vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq+ 0]
+%if ARCH_X86_32
+ mov r0, [rsp+7*mmsize+0*gprsize]
+ movd m2, [grain_lutq+r0+ 0]
+%else
+ movd m2, [grain_lutq+left_offxyq+ 0]
+%endif
+ punpcklbw m2, m4
+ pmaddubsw m3, m9, m2
+ pmulhrsw m3, m8
+ packsswb m3, m3
+ shufps m3, m4, q3210
+ pxor m4, m4
+ pcmpgtb m4, m3
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has already been incremented above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
+%endif
+%endif
+ add grain_lutq, 82
+ dec hw
+ jg %%loop_y_h_overlap
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+%if %2 == 0
+ xor dword r8m, 4
+ ; adjust top_offxyd
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ add offxyd, 16
+%endif
+
+ ; r8m = sbym
+ test dword r8m, 2
+%if %2
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+%else
+ jne %%loop_x_odd_v_overlap
+ jmp %%loop_x_odd
+%endif
+
+%%end:
+ RET
+
+%%vertical_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+ or overlapd, 2 ; top_overlap: overlap & 2
+ mov r8m, overlapd
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul tmpd, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add tmpd, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and tmpd, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, tmpd
+%if ARCH_X86_32
+ xor sbyd, seed ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
+
+ mov lstrideq, r10mp
+%endif
+
+ mov lumaq, r9mp
+ lea src_bakq, [srcq+wq]
+ lea luma_bakq, [lumaq+wq*(1+%2)]
+ neg wq
+ sub r0mp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r11m, luma_bakq
+ mov r4m, wq
+
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+ mov r11mp, src_bakq
+ mov r12mp, strideq
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+ xor tmpd, tmpd
+%endif
+ ; we assume from the block above that bits 8-15 of tmpd are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, top_offxy, unused, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
+%endif
+
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if ARCH_X86_32
+ mov [rsp+7*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+%%loop_x_odd_v_overlap:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %3
+ mova m1, [PIC_ptr(pb_23_22)]
+%else
+ mova m1, [PIC_ptr(pb_27_17)]
+%endif
+%%loop_y_v_overlap:
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq-1, r12, r2
+ vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r0, [rsp+7*mmsize+1*gprsize]
+ movu m4, [grain_lutq+r0]
+%else
+ movu m4, [grain_lutq+top_offxyq]
+%endif
+ punpckhbw m6, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, m1, m6
+ pmaddubsw m3, m1, m4
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ packsswb m3, m2
+ pxor m6, m6
+ pcmpgtb m6, m3
+ punpcklbw m2, m3, m6
+ punpckhbw m3, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; unpack chroma_source
+ pxor m4, m4
+ punpckhbw m6, m0, m4
+ punpcklbw m0, m4 ; m0-1: src as word
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m6, m3
+ pmaxsw m0, m13
+ pmaxsw m6, m13
+ pminsw m0, m12
+ pminsw m6, m12
+ packuswb m0, m6
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ dec hw
+ je %%end_y_v_overlap
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has already been incremented above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
+%endif
+%endif
+ add grain_lutq, 82
+%if %3 == 0
+ btc hd, 16
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ mova m1, [PIC_ptr(pb_17_27)]
+ jnc %%loop_y_v_overlap
+%endif
+ jmp %%loop_y
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+
+%if %2
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to %%loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+%else
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ btc dword r8m, 2
+ jnc %%loop_x_odd_v_overlap
+%endif
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
+
+ mov r6, [rsp+7*mmsize+1*gprsize]
+%if %2
+ lea r0, [r3d+16]
+ add r6, 16
+ mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy
+%else
+ mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy
+%endif
+ mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy
+
+ DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
+
+ mov seed, r3m
+ xor tmpd, tmpd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+%if %2
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offxyq+16]
+%else
+ mov topleft_offxyq, top_offxyq
+ mov left_offxyq, offxyq
+%endif
+
+ ; we assume from the block above that bits 8-15 of tmpd are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
+%endif
+
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if ARCH_X86_32
+ mov [rsp+7*mmsize+1*gprsize], top_offxyd
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %3
+ mova m3, [PIC_ptr(pb_23_22)]
+%else
+ mova m3, [PIC_ptr(pb_27_17)]
+%endif
+%%loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+ mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy
+ mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy
+ movd m1, [grain_lutq+r0]
+ mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy
+%else
+ movd m1, [grain_lutq+topleft_offxyq]
+%endif
+ movu m2, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ movu m6, [grain_lutq+r5]
+ movd m4, [grain_lutq+r0]
+%else
+ movu m6, [grain_lutq+top_offxyq]
+ movd m4, [grain_lutq+left_offxyq]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m1, m6
+ punpcklbw m4, m2
+ pmaddubsw m0, m9, m1
+ pmaddubsw m1, m9, m4
+ REPX {pmulhrsw x, m8}, m0, m1
+ packsswb m0, m1
+ shufps m4, m0, m2, q3232
+ shufps m0, m6, q3210
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m2, m0, m4
+ punpckhbw m0, m4
+ pmaddubsw m4, m3, m0
+ pmaddubsw m1, m3, m2
+ pmulhrsw m4, m8
+ pmulhrsw m1, m8
+ packsswb m1, m4
+
+ ; src
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+%if %3
+ vpgatherdw m7, m4, scalingq-1, r2, r12
+ vpgatherdw m5, m6, scalingq-1, r2, r12
+%else
+ vpgatherdw m7, m4, scalingq-1, r2, r13
+ vpgatherdw m5, m6, scalingq-1, r2, r13
+%endif
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; unpack grain
+ pxor m4, m4
+ pcmpgtb m4, m1
+ punpcklbw m2, m1, m4
+ punpckhbw m1, m4
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m1, m5
+ pmulhrsw m2, m11
+ pmulhrsw m1, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; unpack chroma source
+ pxor m4, m4
+ punpckhbw m5, m0, m4
+ punpcklbw m0, m4 ; m0-1: src as word
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m5, m1
+ pmaxsw m0, m13
+ pmaxsw m5, m13
+ pminsw m0, m12
+ pminsw m5, m12
+ packuswb m0, m5
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has already been adjusted above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*(1+%2)]
+%else
+ add lumaq, r10mp
+%endif
+%endif
+ add grain_lutq, 82
+ dec hw
+%if %3
+ jg %%loop_y_h_overlap
+%else
+ jle %%end_y_hv_overlap
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ mova m3, [PIC_ptr(pb_17_27)]
+ btc hd, 16
+ jnc %%loop_y_hv_overlap
+%if ARCH_X86_64
+ mov lstrideq, r10mp
+%endif
+ jmp %%loop_y_h_overlap
+%%end_y_hv_overlap:
+%if ARCH_X86_64
+ mov lstrideq, r10mp
+%endif
+%endif
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+%if %2
+ jmp %%loop_x_hv_overlap
+%else
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ xor dword r8m, 4
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end_hv:
+ RET
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 422, 1, 0
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 444, 0, 0
diff --git a/third_party/dav1d/src/x86/ipred.h b/third_party/dav1d/src/x86/ipred.h
new file mode 100644
index 0000000000..415a4d8d62
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(dav1d_##name, ssse3)); \
+ decl_##type##_fn(BF(dav1d_##name, avx2)); \
+ decl_##type##_fn(BF(dav1d_##name, avx512icl))
+#define init_fn(type0, type1, name, suffix) \
+ c->type0[type1] = BF(dav1d_##name, suffix)
+
+#define init_angular_ipred_fn(type, name, suffix) \
+ init_fn(intra_pred, type, name, suffix)
+#define init_cfl_pred_fn(type, name, suffix) \
+ init_fn(cfl_pred, type, name, suffix)
+#define init_cfl_ac_fn(type, name, suffix) \
+ init_fn(cfl_ac, type, name, suffix)
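+/* For example, init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3) expands to
+ *   c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, ssse3); */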
+
+decl_fn(angular_ipred, ipred_dc);
+decl_fn(angular_ipred, ipred_dc_128);
+decl_fn(angular_ipred, ipred_dc_top);
+decl_fn(angular_ipred, ipred_dc_left);
+decl_fn(angular_ipred, ipred_h);
+decl_fn(angular_ipred, ipred_v);
+decl_fn(angular_ipred, ipred_paeth);
+decl_fn(angular_ipred, ipred_smooth);
+decl_fn(angular_ipred, ipred_smooth_h);
+decl_fn(angular_ipred, ipred_smooth_v);
+decl_fn(angular_ipred, ipred_z1);
+decl_fn(angular_ipred, ipred_z2);
+decl_fn(angular_ipred, ipred_z3);
+decl_fn(angular_ipred, ipred_filter);
+
+decl_fn(cfl_pred, ipred_cfl);
+decl_fn(cfl_pred, ipred_cfl_128);
+decl_fn(cfl_pred, ipred_cfl_top);
+decl_fn(cfl_pred, ipred_cfl_left);
+
+decl_fn(cfl_ac, ipred_cfl_ac_420);
+decl_fn(cfl_ac, ipred_cfl_ac_422);
+decl_fn(cfl_ac, ipred_cfl_ac_444);
+
+decl_fn(pal_pred, pal_pred);
+
+static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
+#if BITDEPTH == 8
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, ssse3);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, ssse3);
+#endif
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);
+
+ c->pal_pred = BF(dav1d_pal_pred, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx2);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx2);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx2);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2);
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
+#endif
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx512icl);
+#endif
+}
diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm
new file mode 100644
index 0000000000..7ddb189916
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx2.asm
@@ -0,0 +1,4992 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHTS 1-*
+const smooth_weights_1d_16bpc ; sm_weights[] << 7
+ %rep %0
+ dw %1*128
+ %rotate 1
+ %endrep
+const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
+ %rep %0
+ dw %1, 256-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+%if ARCH_X86_64
+
+ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11
+ db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15
+filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1
+filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
+filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1
+pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+ dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
+pw_m1024: times 2 dw -1024
+pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4
+z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8
+pb_90: times 4 db 90
+z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4
+z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11
+z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9
+z_filter_k: dw 4, 4, 5, 5, 4, 4
+ dw 8, 8, 6, 6, 4, 4
+ dw 0, 0, 0, 0, 2, 2
+
+%define pw_2 (z_filter_k+32)
+%define pw_4 (z_filter_k+ 0)
+%define pw_16 (z2_ymul8 +20)
+
+pw_1: times 2 dw 1
+pw_3: times 2 dw 3
+pw_62: times 2 dw 62
+pw_512: times 2 dw 512
+pw_2048: times 2 dw 2048
+pd_8: dd 8
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
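+; each entry is the dword offset of a function-local .%3 label from
+; (%%table - 2*4); the -2*4 bias lets tzcnt(size) (>= 2, since the smallest
+; size is 4) index the table directly, and the dispatch code below adds the
+; table address back before the jmp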
+
+%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
+%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)
+
+JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32
+JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32
+JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
+ movifnidn hd, hm
+ add tlq, 2
+ movd xm4, wd
+ pxor xm3, xm3
+ pavgw xm4, xm3
+ tzcnt wd, wd
+ movd xm5, wd
+ movu m0, [tlq]
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+wq*4]
+ add r6, r5
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ mov hd, hm
+ sub tlq, hq
+ movd xm4, hd
+ sub tlq, hq
+ pxor xm3, xm3
+ pavgw xm4, xm3
+ tzcnt r6d, hd
+ movd xm5, r6d
+ movu m0, [tlq]
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+r6*4]
+ add r6, r5
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ paddw m0, [tlq+96]
+ paddw m0, [tlq+64]
+.h32:
+ paddw m0, [tlq+32]
+.h16:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h8:
+ psrldq xm1, xm0, 8
+ paddw xm0, xm1
+.h4:
+ punpcklwd xm0, xm3
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ paddd xm0, xm4
+ psrld xm0, xm5
+ lea stride3q, [strideq*3]
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp wq
+
+cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm4, r5d
+ tzcnt r5d, r5d
+ movd xm5, r5d
+ lea r5, [ipred_dc_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pxor m3, m3
+ psrlw xm4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movq xm0, [tlq-8]
+ jmp wq
+.w4:
+ movq xm1, [tlq+2]
+ paddw m0, m4
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrld m1, m0, 16
+ paddw m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
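+ ; w+h is 12 or 20 here, so after the >>2 below the sum is scaled by a 0.16
+ ; fixed-point reciprocal (0xAAAB ~ 2/3 for h=8, 0x6667 ~ 2/5 for h=16, the
+ ; shrx count picking which word lands in the low 16 bits) and then halved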
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ lea r2d, [hq*2]
+ mov r6d, 0xAAAB6667
+ shrx r6d, r6d, r2d
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1
+ movd xm1, r6d
+ psrld xm0, 2
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w4_end:
+ vpbroadcastw xm0, xm0
+.s4:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ mova xm0, [tlq-16]
+ jmp wq
+.w8:
+ vextracti128 xm1, m0, 1
+ paddw xm0, [tlq+2]
+ paddw xm0, xm4
+ paddw xm0, xm1
+ psrld xm1, xm0, 16
+ paddw xm0, xm1
+ pblendw xm0, xm3, 0xAA
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w8_end:
+ vpbroadcastw xm0, xm0
+.s8:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-32]
+ jmp wq
+.w16:
+ paddw m0, [tlq+2]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-32]
+ jmp wq
+.w32:
+ paddw m0, [tlq+ 2]
+ paddw m0, [tlq+34]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w32_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+.s32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m0
+ mova [dstq+stride3q +32*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-128]
+ mova m1, [tlq- 96]
+ paddw m0, [tlq- 64]
+ paddw m1, [tlq- 32]
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 2]
+ paddw m0, [tlq+34]
+ paddw m1, [tlq+66]
+ paddw m0, [tlq+98]
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm1, xm4
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, hd
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w64_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s64:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ mov r6d, r8m
+ shr r6d, 11
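+ ; r8m is bitdepth_max: 1023>>11 = 0 (10-bit), 4095>>11 = 1 (12-bit), used
+ ; below to select pw_512 or pw_2048, i.e. half the pixel range, as the fill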
+ lea r5, [ipred_dc_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movu m0, [tlq+ 2]
+ movu m1, [tlq+34]
+ movu m2, [tlq+66]
+ movu m3, [tlq+98]
+ lea r5, [ipred_dc_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+%macro IPRED_H 2 ; w, store_type
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ vpbroadcastw m2, [tlq-6]
+ vpbroadcastw m3, [tlq-8]
+ sub tlq, 8
+ mov%2 [dstq+strideq*0], m0
+ mov%2 [dstq+strideq*1], m1
+ mov%2 [dstq+strideq*2], m2
+ mov%2 [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+ALIGN function_align
+%endmacro
+
+cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ lea r5, [ipred_h_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+INIT_XMM avx2
+.w4:
+ IPRED_H 4, q
+.w8:
+ IPRED_H 8, a
+INIT_YMM avx2
+.w16:
+ IPRED_H 16, a
+.w32:
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ vpbroadcastw m2, [tlq-6]
+ vpbroadcastw m3, [tlq-8]
+ sub tlq, 8
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m2
+ mova [dstq+strideq*2+32*1], m2
+ mova [dstq+stride3q +32*0], m3
+ mova [dstq+stride3q +32*1], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32
+ RET
+.w64:
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ sub tlq, 4
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*0+32*2], m0
+ mova [dstq+strideq*0+32*3], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m1
+ mova [dstq+strideq*1+32*3], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64
+ RET
+
+%macro PAETH 3 ; top, signed_ldiff, ldiff
+ paddw m0, m%2, m1
+ psubw m7, m3, m0 ; tldiff
+ psubw m0, m%1 ; tdiff
+ pabsw m7, m7
+ pabsw m0, m0
+ pminsw m7, m0
+ pcmpeqw m0, m7
+ pcmpgtw m7, m%3, m7
+ vpblendvb m0, m3, m%1, m0
+ vpblendvb m0, m1, m0, m7
+%endmacro
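+; Paeth: with base = left + top - topleft, the prediction is whichever of
+; left, top or topleft is closest to base (ties prefer left, then top)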
+
+cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h
+%define base r5-ipred_paeth_16bpc_avx2_table
+ movifnidn hd, hm
+ lea r5, [ipred_paeth_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ vpbroadcastw m3, [tlq] ; topleft
+ add wq, r5
+ jmp wq
+.w4:
+ vpbroadcastq m2, [tlq+2] ; top
+ movsldup m6, [base+ipred_hv_shuf]
+ lea r3, [strideq*3]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w4_loop:
+ sub tlq, 8
+ vpbroadcastq m1, [tlq]
+ pshufb m1, m6 ; left
+ PAETH 2, 4, 5
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ vbroadcasti128 m2, [tlq+2]
+ movsldup m6, [base+ipred_hv_shuf]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w8_loop:
+ sub tlq, 4
+ vpbroadcastd m1, [tlq]
+ pshufb m1, m6
+ PAETH 2, 4, 5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m2, [tlq+2]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w16_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m2, [tlq+2]
+ movu m6, [tlq+34]
+%if WIN64
+ movaps r4m, xmm8
+ movaps r6m, xmm9
+%endif
+ psubw m4, m2, m3
+ psubw m8, m6, m3
+ pabsw m5, m4
+ pabsw m9, m8
+.w32_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq+32*0], m0
+ PAETH 6, 8, 9
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ movaps xmm8, r4m
+ movaps xmm9, r6m
+%endif
+ RET
+ALIGN function_align
+.w64:
+ WIN64_SPILL_XMM 16
+ movu m2, [tlq+ 2]
+ movu m6, [tlq+34]
+ movu m10, [tlq+66]
+ movu m13, [tlq+98]
+ psubw m4, m2, m3
+ psubw m8, m6, m3
+ psubw m11, m10, m3
+ psubw m14, m13, m3
+ pabsw m5, m4
+ pabsw m9, m8
+ pabsw m12, m11
+ pabsw m15, m14
+.w64_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq+32*0], m0
+ PAETH 6, 8, 9
+ mova [dstq+32*1], m0
+ PAETH 10, 11, 12
+ mova [dstq+32*2], m0
+ PAETH 13, 14, 15
+ mova [dstq+32*3], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_16bpc_avx2_table
+ lea r6, [ipred_smooth_v_16bpc_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea weightsq, [base+smooth_weights_1d_16bpc+hq*4]
+ neg hq
+ vpbroadcastw m5, [tlq+hq*2] ; bottom
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastq m4, [tlq+2] ; top
+ movsldup m3, [base+ipred_hv_shuf]
+ lea r6, [strideq*3]
+ psubw m4, m5 ; top - bottom
+.w4_loop:
+ vpbroadcastq m0, [weightsq+hq*2]
+ pshufb m0, m3
+ pmulhrsw m0, m4
+ paddw m0, m5
+ vextracti128 xm1, m0, 1
+ movhps [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movq [dstq+r6 ], xm0
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+.ret:
+ RET
+.w8:
+ vbroadcasti128 m4, [tlq+2]
+ movsldup m3, [base+ipred_hv_shuf]
+ lea r6, [strideq*3]
+ psubw m4, m5
+.w8_loop:
+ vpbroadcastd m0, [weightsq+hq*2+0]
+ vpbroadcastd m1, [weightsq+hq*2+4]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ vextracti128 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], xm0
+ vextracti128 [dstq+strideq*2], m1, 1
+ mova [dstq+r6 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ movu m4, [tlq+2]
+ lea r6, [strideq*3]
+ psubw m4, m5
+.w16_loop:
+ vpbroadcastw m0, [weightsq+hq*2+0]
+ vpbroadcastw m1, [weightsq+hq*2+2]
+ vpbroadcastw m2, [weightsq+hq*2+4]
+ vpbroadcastw m3, [weightsq+hq*2+6]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r6 ], m3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ WIN64_SPILL_XMM 7
+ movu m4, [tlq+ 2]
+ movu m6, [tlq+34]
+ psubw m4, m5
+ psubw m6, m5
+.w32_loop:
+ vpbroadcastw m1, [weightsq+hq*2+0]
+ vpbroadcastw m3, [weightsq+hq*2+2]
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m6
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m6
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m3, [tlq+ 2]
+ movu m4, [tlq+34]
+ movu m6, [tlq+66]
+ movu m7, [tlq+98]
+ REPX {psubw x, m5}, m3, m4, m6, m7
+.w64_loop:
+ vpbroadcastw m2, [weightsq+hq*2]
+ pmulhrsw m0, m3, m2
+ pmulhrsw m1, m4, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*0], m0
+ pmulhrsw m0, m6, m2
+ mova [dstq+32*1], m1
+ pmulhrsw m1, m7, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+%define base r6-ipred_smooth_h_16bpc_avx2_table
+ lea r6, [ipred_smooth_h_16bpc_avx2_table]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m5, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd
+ movsxd wq, [r6+wq*4]
+ sub tlq, hq
+ lea stride3q, [strideq*3]
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2]
+ movsldup m3, [base+ipred_hv_shuf]
+.w4_loop:
+ vpbroadcastq m0, [tlq+hq-8] ; left
+ pshufb m0, m3
+ psubw m0, m5 ; left - right
+ pmulhrsw m0, m4
+ paddw m0, m5
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2]
+ movsldup m3, [base+ipred_hv_shuf]
+.w8_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ vpbroadcastd m1, [tlq+hq-8]
+ pshufb m0, m3
+ pshufb m1, m3
+ psubw m0, m5
+ psubw m1, m5
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ movu m4, [base+smooth_weights_1d_16bpc+16*2]
+.w16_loop:
+ vpbroadcastq m3, [tlq+hq-8]
+ punpcklwd m3, m3
+ psubw m3, m5
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w16_loop
+ RET
+.w32:
+ WIN64_SPILL_XMM 7
+ movu m4, [base+smooth_weights_1d_16bpc+32*2]
+ movu m6, [base+smooth_weights_1d_16bpc+32*3]
+.w32_loop:
+ vpbroadcastw m1, [tlq+hq-2]
+ vpbroadcastw m3, [tlq+hq-4]
+ psubw m1, m5
+ psubw m3, m5
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m6
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m6
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w32_loop
+ RET
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m3, [base+smooth_weights_1d_16bpc+32*4]
+ movu m4, [base+smooth_weights_1d_16bpc+32*5]
+ movu m6, [base+smooth_weights_1d_16bpc+32*6]
+ movu m7, [base+smooth_weights_1d_16bpc+32*7]
+.w64_loop:
+ vpbroadcastw m2, [tlq+hq-2]
+ psubw m2, m5
+ pmulhrsw m0, m3, m2
+ pmulhrsw m1, m4, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*0], m0
+ pmulhrsw m0, m6, m2
+ mova [dstq+32*1], m1
+ pmulhrsw m1, m7, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ add dstq, strideq
+ sub hq, 1*2
+ jg .w64_loop
+ RET
+
+%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
+ pmaddwd m0, m%1, m%3
+ pmaddwd m1, m%2, m%4
+ paddd m0, m%5
+ paddd m1, m%6
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m5
+%endmacro
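+; result = (w_v*top + (256-w_v)*bottom + w_h*left + (256-w_h)*right + 256) >> 9;
+; the psrld by 8 followed by pavgw against zero (m5) performs the rounded >>9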
+
+cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_16bpc_avx2_table
+ lea r6, [ipred_smooth_16bpc_avx2_table]
+ mov wd, wm
+ vpbroadcastw m4, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ sub tlq, hq
+ sub tlq, hq
+ movsxd wq, [r6+wq*4]
+ pxor m5, m5
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4]
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 11
+ vpbroadcastw m0, [tlq] ; bottom
+ vpbroadcastq m6, [tlq+hq*2+2]
+ movsldup m7, [base+ipred_hv_shuf]
+ movshdup m9, [base+ipred_hv_shuf]
+ vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4]
+ punpcklwd m6, m0 ; top, bottom
+ punpcklqdq m8, m9, m9
+ punpckhqdq m9, m9
+ lea r3, [strideq*3]
+.w4_loop:
+ vpbroadcastq m3, [tlq+hq*2-8]
+ vbroadcasti128 m1, [v_weightsq]
+ pshufb m3, m7
+ punpcklwd m2, m3, m4 ; left, right
+ punpckhwd m3, m4
+ pmaddwd m2, m10
+ pmaddwd m3, m10
+ pshufb m0, m1, m8
+ pshufb m1, m9
+ SMOOTH_2D_END 0, 1, 6, 6, 2, 3
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add v_weightsq, 16
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8:
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ vpbroadcastw m0, [tlq] ; bottom
+ vbroadcasti128 m7, [tlq+hq*2+2]
+ movsldup m8, [base+ipred_hv_shuf]
+ movshdup m9, [base+ipred_hv_shuf]
+ vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0]
+ vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1]
+ punpcklwd m6, m7, m0 ; top, bottom
+ punpckhwd m7, m0
+.w8_loop:
+ vpbroadcastd m3, [tlq+hq*2-4]
+ vpbroadcastq m1, [v_weightsq]
+ pshufb m3, m8
+ punpcklwd m2, m3, m4 ; left, right
+ punpckhwd m3, m4
+ pmaddwd m2, m10
+ pmaddwd m3, m11
+ pshufb m1, m9
+ SMOOTH_2D_END 1, 1, 6, 7, 2, 3
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add v_weightsq, 8
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 11
+ vpbroadcastw m0, [tlq] ; bottom
+ movu m7, [tlq+hq*2+2]
+ mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0]
+ mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
+ vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
+ vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
+ punpcklwd m6, m7, m0 ; top, bottom
+ punpckhwd m7, m0
+.w16_loop:
+ vpbroadcastd m3, [tlq+hq*2-4]
+ vpbroadcastd m1, [v_weightsq+0]
+ punpcklwd m3, m4 ; left, right
+ pshufd m2, m3, q1111
+ pmaddwd m10, m8, m2
+ pmaddwd m2, m9
+ pshufd m3, m3, q0000
+ SMOOTH_2D_END 1, 1, 6, 7, 10, 2
+ vpbroadcastd m1, [v_weightsq+4]
+ pmaddwd m2, m8, m3
+ pmaddwd m3, m9
+ mova [dstq+strideq*0], m0
+ SMOOTH_2D_END 1, 1, 6, 7, 2, 3
+ mova [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ add v_weightsq, 8
+ sub hq, 2
+ jg .w16_loop
+ RET
+.w32:
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastw m0, [tlq] ; bottom
+ movu m7, [tlq+hq*2+ 2]
+ movu m9, [tlq+hq*2+34]
+ mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0]
+ mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
+ vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
+ vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
+ mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
+ mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
+ vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
+ vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
+ punpcklwd m6, m7, m0
+ punpckhwd m7, m0
+ punpcklwd m8, m9, m0
+ punpckhwd m9, m0
+.w32_loop:
+ vpbroadcastw m3, [tlq+hq*2-2]
+ vpbroadcastd m14, [v_weightsq]
+ punpcklwd m3, m4
+ pmaddwd m1, m10, m3
+ pmaddwd m2, m11, m3
+ pmaddwd m0, m6, m14
+ paddd m0, m1
+ pmaddwd m1, m7, m14
+ paddd m1, m2
+ pmaddwd m2, m12, m3
+ pmaddwd m3, m13
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m5
+ mova [dstq+32*0], m0
+ SMOOTH_2D_END 14, 14, 8, 9, 2, 3
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ add v_weightsq, 4
+ dec hd
+ jg .w32_loop
+ RET
+.w64:
+%assign stack_offset stack_offset - stack_size_padded
+ PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
+ mov dst_baseq, dstq
+ mov tl_baseq, tlq
+ mov v_weights_baseq, v_weightsq
+ xor xq, xq
+.w64_loop_x:
+ mov yq, hq
+ lea tlq, [tl_baseq+hq*2]
+ vpbroadcastw m0, [tl_baseq] ; bottom
+ movu m7, [tlq+xq*2+ 2]
+ movu m9, [tlq+xq*2+34]
+ mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0]
+ mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
+ vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
+ vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
+ mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
+ mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
+ vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
+ vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
+ punpcklwd m6, m7, m0
+ punpckhwd m7, m0
+ punpcklwd m8, m9, m0
+ punpckhwd m9, m0
+ lea tlq, [tl_baseq-2]
+.w64_loop_y:
+ vpbroadcastw m3, [tlq+yq*2]
+ vpbroadcastd m1, [v_weightsq]
+ punpcklwd m3, m4
+ pmaddwd m14, m10, m3
+ pmaddwd m15, m11, m3
+ pmaddwd m2, m12, m3
+ pmaddwd m3, m13
+ pmaddwd m0, m6, m1
+ paddd m0, m14
+ pmaddwd m14, m7, m1
+ paddd m14, m15
+ psrld m0, 8
+ psrld m14, 8
+ packssdw m0, m14
+ pavgw m0, m5
+ mova [dstq+32*0], m0
+ SMOOTH_2D_END 8, 9, 1, 1, 2, 3
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ add v_weightsq, 4
+ dec yq
+ jg .w64_loop_y
+ lea dstq, [dst_baseq+32*2]
+ add r6, 16*8
+ mov v_weightsq, v_weights_baseq
+ add xq, 32
+ test xb, 64
+ jz .w64_loop_x
+ RET
+
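+; Z1 directional prediction (angle < 90): all pixels come from the top edge.
+; For each position, base = xpos >> 6 and frac = xpos & 0x3e (pw_62 in m5):
+;   pred = top[base] + (((top[base+1] - top[base]) * frac + 32) >> 6)
+; computed via pmulhrsw on frac << 9 as the inline comments below note. Once
+; xpos passes max_base_x, the remaining pixels fall back to the last edge
+; pixel kept in m6. Depending on the angle, block size and
+; enable_intra_edge_filter, the edge is first smoothed or 2x upsampled into a
+; stack buffer.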
+cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z1_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea r7, [dr_intra_derivative]
+ movsxd wq, [r6+wq*4]
+ add tlq, 2
+ add wq, r6
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [r7+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ vpbroadcastd m5, [pw_62]
+ jmp wq
+.w4:
+ ALLOC_STACK -64, 7
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ vpbroadcastw xm3, [tlq+14]
+ movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8
+ paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ add dxd, dxd
+ palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8
+ paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d
+ psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
+ psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
+ pxor xm4, xm4
+ paddw xm2, xm0
+ vpbroadcastw xm0, r8m ; pixel_max
+ mova [rsp+32], xm3
+ movd xm3, dxd
+ pmaxsw xm2, xm4
+ mov r3d, dxd
+ pavgw xm2, xm4
+ vpbroadcastw m3, xm3
+ pminsw xm2, xm0
+ punpcklwd xm0, xm1, xm2
+ punpckhwd xm1, xm2
+ lea r5, [strideq*3]
+ pslldq m2, m3, 8
+ mova [rsp+ 0], xm0
+ mova [rsp+16], xm1
+ paddw m6, m3, m3
+ paddw m3, m2
+ vpblendd m4, m6, 0xf0
+ paddw m6, m6
+ paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3
+ vbroadcasti128 m4, [z_upsample]
+.w4_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu xm2, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base2
+ vinserti128 m1, [rsp+r3*2], 1 ; 0 2
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base3
+ vinserti128 m2, [rsp+r2*2], 1 ; 1 3
+ pshufb m1, m4
+ pshufb m2, m4
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m5, m3 ; frac
+ psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6
+ psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6)
+ pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
+ paddw m3, m6 ; xpos += dx
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r5 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_upsample_loop
+ RET
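+; Shared w4/w8/w16 helper: compares the edge length in maxbased and the
+; prediction angle against the z_filter_wh/z_filter_t0 tables (row selected by
+; is_sm) and returns a byte mask in r5d. Callers popcnt the mask to get the
+; intra edge filter strength: 0 skips filtering, 1-2 use the 3-tap kernels
+; from z_filter_k, and 3 takes the wider 5-tap paths.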
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+%define base r3-z_filter_t0
+ movd xm0, maxbased
+ lea r3, [z_filter_t0]
+ movd xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m1, xm1
+ pcmpeqb m0, [base+z_filter_wh]
+ mova xm2, [r3+angleq*8]
+ pand m0, m1
+ pcmpgtb m0, m2
+ pmovmskb r5d, m0
+ ret
+.w4_no_upsample:
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea maxbased, [hq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastw xm3, [tlq+14]
+ mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
+ palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8
+ pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ paddw xm2, xm0
+ pmullw xm2, xm4
+ movd [rsp+16], xm3
+ cmp r5d, 3
+ jne .w4_3tap
+ paddw xm1, xm2
+ palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8
+ pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6
+ movzx r3d, word [tlq+14]
+ movzx r2d, word [tlq+12]
+ inc maxbased
+ paddw xm2, xm0
+ sub r2d, r3d
+ paddw xm2, xm2
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3
+ mov [rsp+16], r2w
+.w4_3tap:
+ pxor xm0, xm0
+ paddw xm1, xm2
+ mov tlq, rsp
+ psrlw xm1, 3
+ cmp hd, 8
+ sbb maxbased, -1
+ pavgw xm0, xm1
+ mova [tlq], xm0
+.w4_main:
+ movd xm3, dxd
+ vpbroadcastq m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x]
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ movd xm0, maxbased
+ mov r3d, dxd ; xpos
+ vpbroadcastw m0, xm0
+ paddw m4, m3, m3
+ psubw m1, m0 ; -max_base_x
+ vpblendd m3, m4, 0xcc
+ paddw m0, m4, m3
+ vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3
+ paddw m4, m4
+ paddw m3, m1
+.w4_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [tlq+r3*2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ movu xm2, [tlq+r5*2]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ vinserti128 m1, [tlq+r3*2], 1 ; 0 2
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ vinserti128 m2, [tlq+r5*2], 1 ; 1 3
+ punpcklqdq m0, m1, m2
+ psrldq m1, 2
+ pslldq m2, 6
+ vpblendd m1, m2, 0xcc
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; xpos < max_base_x
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w4_loop
+ lea r6, [strideq*3]
+.w4_end_loop:
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ movq [dstq+strideq*2], xm6
+ movq [dstq+r6 ], xm6
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_end_loop
+.w4_end:
+ RET
+.w8:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 7
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _
+ movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _
+ movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ cmp hd, 4
+ jne .w8_upsample_h8 ; awkward single-pixel edge case
+ vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _
+.w8_upsample_h8:
+ paddw m2, m1
+ paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ add dxd, dxd
+ psubw m0, m2, m0
+ psraw m0, 3
+ pxor m4, m4
+ paddw m2, m0
+ vpbroadcastw m0, r8m
+ movd xm3, dxd
+ pmaxsw m2, m4
+ mov r3d, dxd
+ pavgw m2, m4
+ vpbroadcastw m3, xm3
+ pminsw m2, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ vbroadcasti128 m4, [z_upsample]
+ mova [rsp+ 0], xm0
+ mova [rsp+16], xm1
+ paddw m6, m3, m3
+ vextracti128 [rsp+32], m0, 1
+ vextracti128 [rsp+48], m1, 1
+ vpblendd m3, m6, 0xf0 ; xpos0 xpos1
+.w8_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [rsp+r3*2]
+ movu xm2, [rsp+r3*2+16]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ vinserti128 m1, [rsp+r2*2], 1
+ vinserti128 m2, [rsp+r2*2+16], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m3, m6
+ paddw m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_upsample_loop
+ RET
+.w8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(h+7, 15)
+ jmp .w8_main
+.w8_no_upsample:
+ lea maxbased, [hq+7]
+ test angled, 0x400
+ jnz .w8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w8_main
+ popcnt r5d, r5d
+ vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m2
+ cmp hd, 8
+ jl .w8_filter_h4
+ punpckhwd m2, m2
+ vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ je .w8_filter_end ; 8x4 and 8x8 are always 3-tap
+ movzx r3d, word [tlq+30]
+ mov maxbased, 16
+ mov [rsp+32], r3d
+ cmp r5d, 3
+ jne .w8_filter_end
+ punpcklwd xm6, xm0, xm0
+ vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g
+ vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movzx r5d, word [tlq+28]
+ mov [rsp+34], r3w
+ paddw m2, m6
+ sub r5d, r3d
+ inc maxbased
+ paddw m2, m2
+ lea r3d, [r5+r3*8+4]
+ paddw m1, m2
+ shr r3d, 3
+ mov [rsp+32], r3w
+ jmp .w8_filter_end
+.w8_filter_h4:
+ pshuflw m3, m2, q3321
+ vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _
+.w8_filter_end:
+ paddw m0, m3
+ pmullw m0, m4
+ mov tlq, rsp
+ pxor m2, m2
+ paddw m0, m1
+ psrlw m0, 3
+ pavgw m0, m2
+ mova [tlq], m0
+.w8_main:
+ movd xm3, dxd
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ movd xm0, maxbased
+ mov r3d, dxd
+ vpbroadcastw m0, xm0
+ paddw m4, m3, m3
+ psubw m1, m0
+ vpblendd m3, m4, 0xf0 ; xpos0 xpos1
+ paddw m3, m1
+.w8_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6
+ movu xm0, [tlq+r3*2]
+ movu xm1, [tlq+r3*2+2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6
+ vinserti128 m0, [tlq+r5*2], 1
+ vinserti128 m1, [tlq+r5*2+2], 1
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w8_loop
+.w8_end_loop:
+ mova [dstq+strideq*0], xm6
+ mova [dstq+strideq*1], xm6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(h+15, 31)
+ jmp .w16_main
+.w16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 7
+ lea maxbased, [hq+15]
+ test angled, 0x400
+ jnz .w16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w16_main
+ popcnt r5d, r5d
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ cmp r5d, 3
+ jne .w16_filter_3tap
+ vpbroadcastd m2, [base+pw_3]
+ punpcklwd xm0, xm0
+ vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m0, m2
+ pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m0, m1
+ psrlw m0, 2
+ movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ cmp hd, 8
+ jl .w16_filter_5tap_h4
+ punpckhwd m3, m3
+ je .w16_filter_5tap_h8
+ vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movzx r3d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ pavgw m2, m4
+ sub r2d, r3d
+ paddw m1, m3
+ lea r2d, [r2+r3*8+4]
+ paddw m1, m2
+ shr r2d, 3
+ psrlw m1, 2
+ mov [rsp+66], r3w
+ mov [rsp+64], r2w
+ mov tlq, rsp
+ mov r3d, 33
+ cmp hd, 16
+ cmovg maxbased, r3d
+ jmp .w16_filter_end2
+.w16_filter_5tap_h8:
+ vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
+ vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
+ pavgw xm2, xm4
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2
+ jmp .w16_filter_end2
+.w16_filter_5tap_h4:
+ pshuflw xm4, xm3, q3332 ; 4 5 5 5
+ pshuflw xm3, xm3, q3321 ; 3 4 5 5
+ pavgw xm2, xm4
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2
+ jmp .w16_filter_end2
+.w16_filter_3tap:
+ vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m4
+ pmullw m3, m2
+ paddw m0, m1
+ cmp hd, 8
+ je .w16_filter_3tap_h8
+ jl .w16_filter_3tap_h4
+ punpckhwd m2, m2
+ vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ jmp .w16_filter_end
+.w16_filter_3tap_h4:
+ pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _
+ jmp .w16_filter_end
+.w16_filter_3tap_h8:
+ psrldq xm2, 2
+ pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8
+.w16_filter_end:
+ paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ pmullw m2, m4
+ psrlw m0, 3
+ pxor m1, m1
+ paddw m2, m3
+ psrlw m2, 3
+ pavgw m0, m1
+ pavgw m1, m2
+.w16_filter_end2:
+ mov tlq, rsp
+ mova [tlq+ 0], m0
+ mova [tlq+32], m1
+.w16_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ movd xm0, maxbased
+ mov r3d, dxd
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ psubw m3, m0
+.w16_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m1, m0
+ movu m0, [tlq+r5*2]
+ vpblendvb m2, m6, m1, m2
+ movu m1, [tlq+r5*2+2]
+ mova [dstq+strideq*0], m2
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w16_loop
+.w16_end_loop:
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -160, 8
+ lea maxbased, [hq+31]
+ mov r3d, 63
+ cmp hd, 32
+ cmova maxbased, r3d
+ test angled, 0x400
+ jnz .w32_main
+ vpbroadcastd m2, [pw_3]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m1, m2
+ paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ mov r3, rsp
+ paddw m0, m1
+ lea r5d, [maxbaseq-31]
+ psrlw m0, 2
+ mova [r3], m0
+.w32_filter_loop:
+ mova m0, [tlq+30]
+ paddw m1, m2, [tlq+28]
+ add tlq, 32
+ paddw m0, [tlq+0]
+ pavgw m1, [tlq+4]
+ paddw m0, [tlq+2]
+ add r3, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r3], m0
+ sub r5d, 16
+ jg .w32_filter_loop
+ movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ punpckhwd m1, m0, m0
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ jl .w32_filter_h8
+ vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movzx r5d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ pavgw m2, m3
+ sub r2d, r5d
+ paddw m0, m1
+ lea r2d, [r2+r5*8+4]
+ paddw m0, m2
+ shr r2d, 3
+ psrlw m0, 2
+ mova [r3+32], m0
+ mov [r3+66], r5w
+ mov [r3+64], r2w
+ mov tlq, rsp
+ mov r3d, 65
+ cmp hd, 64
+ cmove maxbased, r3d
+ jmp .w32_main
+.w32_filter_h8:
+ vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
+ vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
+ pavgw xm2, xm3
+ paddw xm0, xm1
+ mov tlq, rsp
+ paddw xm0, xm2
+ psrlw xm0, 2
+ mova [r3+32], xm0
+.w32_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ movd xm0, maxbased
+ mov r5d, dxd
+ vpbroadcastd m7, [pw_m1024] ; -16 * 64
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ psubw m3, m0
+.w32_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ psraw m1, m3, 15
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*0], m0
+ movu m0, [tlq+r3*2+32]
+ movu m1, [tlq+r3*2+34]
+ add r5d, dxd
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m7, m3
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+32*1], m0
+ dec hd
+ jz .w32_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w32_loop
+.w32_end_loop:
+ mova [dstq+32*0], m6
+ mova [dstq+32*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -256, 10
+ lea maxbased, [hq+63]
+ test angled, 0x400
+ jnz .w64_main
+ vpbroadcastd m2, [pw_3]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m1, m2
+ paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ mov r3, rsp
+ paddw m0, m1
+ lea r5d, [hq+32]
+ psrlw m0, 2
+ mova [r3], m0
+.w64_filter_loop:
+ mova m0, [tlq+30]
+ paddw m1, m2, [tlq+28]
+ add tlq, 32
+ paddw m0, [tlq+0]
+ pavgw m1, [tlq+4]
+ paddw m0, [tlq+2]
+ add r3, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r3], m0
+ sub r5d, 16
+ jg .w64_filter_loop
+ movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ punpckhwd m1, m0, m0
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ pavgw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ mov tlq, rsp
+ psrlw m0, 2
+ mova [r3+32], m0
+.w64_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ movd xm0, maxbased
+ mov r5d, dxd
+ vpbroadcastd m7, [pw_m1024] ; -16 * 64
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ paddw m8, m7, m7 ; -32 * 64
+ psubw m3, m0
+ paddw m9, m8, m7 ; -48 * 64
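+; xpos in m3 is biased by -max_base_x. The four 32-byte chunks of a row cover
+; edge positions base+0..15, +16..31, +32..47 and +48..63, so instead of
+; rebiasing per chunk, chunk 0 tests the sign of m3 and chunks 1-3 compare it
+; against -16*64, -32*64 and -48*64 (m7, m8, m9) to decide where to substitute
+; the replicated last edge pixel in m6.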
+.w64_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ psraw m1, m3, 15
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*0], m0
+ movu m0, [tlq+r3*2+32]
+ movu m1, [tlq+r3*2+34]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m7, m3
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*1], m0
+ movu m0, [tlq+r3*2+64]
+ movu m1, [tlq+r3*2+66]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m8, m3
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*2], m0
+ movu m0, [tlq+r3*2+96]
+ movu m1, [tlq+r3*2+98]
+ add r5d, dxd
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m9, m3
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+32*3], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w64_loop
+.w64_end_loop:
+ mova [dstq+32*0], m6
+ mova [dstq+32*1], m6
+ mova [dstq+32*2], m6
+ mova [dstq+32*3], m6
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+
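+; Z2 directional prediction (90 < angle < 180): both edges contribute. The
+; left edge and top-left pixel are copied to the bottom of the stack buffer
+; (left pixels at [rsp+0..127], the top-left sample at [rsp+128]) and the top
+; edge, filtered or upsampled as needed, lives from [rsp+130] onwards. Each
+; pixel is interpolated z1-style along the top row while its base_x is still
+; right of the top-left corner and z3-style down the left column otherwise;
+; the psraw/vpblendvb pairs below ("base_x < topleft") select between the two.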
+cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy
+%define base r9-z_filter_t0
+ lea r9, [ipred_z2_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea dxq, [dr_intra_derivative-90]
+ movsxd wq, [r9+wq*4]
+ mova m1, [tlq- 0]
+ movzx dyd, angleb
+ xor angled, 0x400
+ mova m2, [tlq- 32]
+ mov r8, dxq
+ sub dxq, dyq
+ mova m3, [tlq- 64]
+ add wq, r9
+ add r9, z_filter_t0-ipred_z2_16bpc_avx2_table
+ mova m4, [tlq- 96]
+ and dyd, ~1
+ mova m5, [tlq-128]
+ and dxq, ~1
+ movzx dyd, word [r8+dyq] ; angle - 90
+ movzx dxd, word [dxq+270] ; 180 - angle
+ vpbroadcastd m11, [base+pw_62]
+ mova [rsp+128], m1
+ mova [rsp+ 96], m2
+ mova [rsp+ 64], m3
+ neg dxd
+ mova [rsp+ 32], m4
+ neg dyq
+ mova [rsp+ 0], m5
+ jmp wq
+.w4:
+ vbroadcasti128 m10, [base+z2_x_shuf]
+ vpbroadcastq m6, [base+z_base_inc+2]
+ lea r8d, [dxq+(65<<6)] ; xpos
+ mov r10d, (63-4)<<6
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ movq xm0, [tlq+2] ; 1 2 3 4
+ movq xm1, [tlq+0] ; 0 1 2 3
+ pshuflw xm2, xm0, q3321 ; 2 3 4 4
+ pshuflw xm3, xm1, q2100 ; 0 0 1 2
+ vpbroadcastw xm4, r8m ; pixel_max
+ vbroadcasti128 m10, [base+z_upsample]
+ paddw xm1, xm0
+ paddw xm2, xm3
+ lea r8d, [r8+dxq+(1<<6)]
+ psubw xm2, xm1, xm2
+ add dxd, dxd
+ psraw xm2, 3
+ pxor xm3, xm3
+ sub r10d, 3<<6
+ paddw xm1, xm2
+ paddw m6, m6
+ pmaxsw xm1, xm3
+ sub angled, 1075 ; angle - 53
+ pavgw xm1, xm3
+ lea r3d, [hq+3]
+ pminsw xm1, xm4
+ xor angled, 0x7f ; 180 - angle
+ punpcklwd xm1, xm0
+ movu [rsp+130], xm1
+ call .filter_strength
+ jmp .w4_filter_left
+ALIGN function_align
+.filter_strength:
+ movd xm8, r3d
+ mov r3d, angled
+ movd xm7, angled
+ vpbroadcastb m8, xm8
+ shr r3d, 8 ; is_sm << 1
+ vpbroadcastb m7, xm7
+ pcmpeqb m8, [base+z_filter_wh]
+ mova xm9, [r9+r3*8]
+ pand m0, m8, m7
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ ret
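+; 2x upsampling of the left edge (h4/h8), mirroring the top-edge upsample:
+; inserted samples are effectively (-a + 9*b + 9*c - d + 8) >> 4, clamped to
+; [0, pixel_max], interleaved with the original pixels back into the stack
+; copy of the left edge, and dy is doubled to match the finer sample grid.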
+ALIGN function_align
+.upsample_left: ; h4/h8
+ mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1
+ movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0
+ vpbroadcastw xm4, r8m ; pixel_max
+ cmp hd, 8
+ je .upsample_left_h8
+ pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2
+ pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0
+ jmp .upsample_left_end
+.upsample_left_h8:
+ pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2
+ pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0
+.upsample_left_end:
+ paddw xm1, xm0
+ paddw xm2, xm3
+ psubw xm2, xm1, xm2
+ add dyq, dyq
+ psraw xm2, 3
+ pxor xm3, xm3
+ paddw xm1, xm2
+ pmaxsw xm1, xm3
+ pavgw xm1, xm3
+ pminsw xm1, xm4
+ punpcklwd xm2, xm0, xm1
+ punpckhwd xm0, xm1
+ mova [rsp+ 96+gprsize], xm2
+ mova [rsp+112+gprsize], xm0
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ sub angled, 1112 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w4_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0]
+ psrldq xm0, xm1, 2 ; 1 2 3 4
+ pshuflw xm2, xm1, q2100 ; 0 0 1 2
+ pmullw xm4, xm0
+ pshuflw xm3, xm0, q3321 ; 2 3 4 4
+ paddw xm1, xm3
+ pshuflw xm3, xm0, q3332 ; 3 4 4 4
+ pmullw xm1, xm5
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2]
+ paddw xm2, xm3
+ vpbroadcastd xm3, r6m ; max_width
+ pmullw xm2, xm5
+ packssdw xm3, xm3
+ paddw xm1, xm4
+ paddw xm1, xm2
+ psubw xm3, [base+pw_1to16]
+ pxor xm4, xm4
+ psrlw xm1, 3
+ pminsw xm3, xm11 ; clip to byte range since there's no variable word blend
+ pavgw xm1, xm4
+ vpblendvb xm1, xm0, xm3
+ movq [rsp+130], xm1
+.w4_no_filter_above:
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm0, [base+pb_90]
+ psubb xm0, xm7 ; 180 - angle
+ pand xm0, xm8 ; reuse from previous filter_strength call
+ pcmpgtb xm0, xm9
+ pmovmskb r3d, xm0
+.w4_filter_left:
+ test r3d, r3d
+ jz .w4_main
+ popcnt r3d, r3d
+ mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ vpbroadcastd m5, r7m ; max_height
+ cmp r3d, 3
+ je .w4_filter_left_s3
+ vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pmullw m2, m0
+ cmp hd, 8
+ jl .w4_filter_left_h4
+ movu m4, [tlq-34]
+ punpcklwd m1, m0, m0
+ vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ je .w4_filter_left_end
+ vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ jmp .w4_filter_left_end
+.w4_upsample_left:
+ call .upsample_left
+ mov r11, -16
+ vbroadcasti128 m9, [base+z_upsample]
+ jmp .w4_main_upsample_left
+.w4_filter_left_s3: ; can only be h16
+ movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m4, [base+pw_3]
+ paddw m1, m0, m2
+ punpckhwd m2, m2
+ vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ punpcklwd xm3, xm0, xm0
+ paddw m2, m4
+ vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d
+ paddw m1, m4
+ pavgw m2, m3
+ paddw m1, m2
+ psrlw m1, 2
+ jmp .w4_filter_left_end2
+.w4_filter_left_h4:
+ pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e
+.w4_filter_left_end:
+ paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m3
+ paddw m1, m2
+ pxor m2, m2
+ psrlw m1, 3
+ pavgw m1, m2
+.w4_filter_left_end2:
+ packssdw m5, m5
+ psubw m5, [base+pw_16to1]
+ pminsw m5, m11
+ vpblendvb m1, m0, m5
+ mova [rsp+96], m1
+.w4_main:
+ vbroadcasti128 m9, [base+z2_x_shuf]
+ mov r11, -8
+.w4_main_upsample_left:
+ movd xm5, dyd
+ mova m4, [base+z2_y_shuf_h4]
+ mov r2d, r8d
+ movd xm0, dxd
+ vpbroadcastw m5, xm5
+ rorx r5, dyq, 5
+ lea r8d, [dyq*3]
+ pmullw m5, [base+z2_ymul]
+ rorx r9, dyq, 4
+ sar dyd, 6
+ vpbroadcastw m0, xm0
+ sar r8d, 6
+ pand m5, m11 ; frac_y
+ neg dyd
+ psllw m5, 9
+ add r5d, dyd
+ add r8d, dyd
+ add r9d, dyd
+ paddw m7, m0, m0
+ lea dyq, [rsp+dyq*2+126]
+ vpblendd m0, m7, 0xcc
+ add dyq, r11
+ neg r5d
+ paddw m1, m0, m7
+ neg r8d
+ vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3
+ neg r9d
+ paddw m7, m7
+ paddw m6, m0
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm1, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movu xm3, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ vinserti128 m1, [rsp+r2*2], 1
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ vinserti128 m3, [rsp+r3*2], 1
+ pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3
+ pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3
+ pand m2, m11, m6
+ punpcklqdq m0, m1, m3
+ punpckhqdq m1, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ cmp r3d, 64
+ jge .w4_toponly
+ movu xm2, [dyq]
+ vinserti128 m2, [dyq+r8*2], 1
+ movu xm3, [dyq+r5*2]
+ vinserti128 m3, [dyq+r9*2], 1
+ pshufb m2, m9
+ pshufb m3, m9
+ punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m5
+ psraw m3, m6, 15 ; base_x < topleft
+ paddw m1, m2
+ vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3
+ vpblendvb m0, m1, m3
+.w4_toponly:
+ paddw m6, m7 ; xpos += dx
+ lea r3, [strideq*3]
+ add dyq, r11
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r10d
+ jge .w4_loop
+.w4_leftonly_loop:
+ movu xm1, [dyq]
+ vinserti128 m1, [dyq+r8*2], 1
+ movu xm2, [dyq+r5*2]
+ vinserti128 m2, [dyq+r9*2], 1
+ add dyq, r11
+ pshufb m1, m9
+ pshufb m2, m9
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ psubw m1, m0
+ pmulhrsw m1, m5
+ paddw m0, m1
+ vpermd m0, m4, m0
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ RET
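+; The w8 code below also serves w16/w32/w64: r10d keeps the height in its low
+; byte and the number of remaining 8-wide column blocks in bits 8 and up
+; (.w16/.w32/.w64 start it at 1/3/7). .w8_end then advances dst by 8 pixels,
+; base_y by 8*dy and base_x by 8*64 and restarts .w8_loop0 until the block
+; count underflows.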
+.w8:
+ mov r10d, hd
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+126]
+ xor r8d, r8d
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8
+ mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7
+ pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8
+ pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6
+ vpbroadcastw xm4, r8m ; pixel_max
+ paddw xm1, xm0
+ paddw xm2, xm3
+ not r8d
+ psubw xm2, xm1, xm2
+ add dxd, dxd
+ psraw xm2, 3
+ sub angled, 53 ; angle - 53
+ pxor xm3, xm3
+ paddw xm2, xm1
+ lea r3d, [hq+7]
+ pmaxsw xm2, xm3
+ xor angled, 0x7f ; 180 - angle
+ pavgw xm2, xm3
+ pminsw xm2, xm4
+ punpcklwd xm1, xm2, xm0
+ punpckhwd xm2, xm0
+ movu [rsp+130], xm1
+ movu [rsp+146], xm2
+ call .filter_strength
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ sub angled, 90 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w8_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2]
+ movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x
+ pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x
+ pmullw xm4, xm0
+ pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x
+ paddw xm1, xm3
+ vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x
+ paddw xm2, xm3
+ vpbroadcastd xm3, r6m ; max_width
+ pmullw xm1, xm5
+ pmullw xm2, xm6
+ packssdw xm3, xm3
+ paddw xm1, xm4
+ paddw xm1, xm2
+ psubw xm3, [base+pw_1to16]
+ pxor xm4, xm4
+ psrlw xm1, 3
+ pminsw xm3, xm11
+ pavgw xm1, xm4
+ vpblendvb xm1, xm0, xm3
+ movu [rsp+130], xm1
+.w8_no_filter_above:
+ lea r3d, [angleq-51]
+ mov r3b, hb
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+.w8_filter_left:
+ test r3d, r3d
+ jz .w8_main
+ popcnt r3d, r3d
+ cmp r3d, 3
+ jne .w8_filter_left_s12
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m7, [base+pw_16]
+ cmp hd, 16 ; flags needed for later
+ jmp .filter_left_s3b
+.w8_upsample_left:
+ call .upsample_left
+ vbroadcasti128 m7, [base+z2_y_shuf_us]
+ lea r11, [rsp+118]
+ mov r8, -8
+ jmp .w8_main_upsample_left
+.w16_filter_left_s12:
+ xor r8d, r8d
+.w8_filter_left_s12:
+ mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ vpbroadcastd m5, r7m ; max_height
+ vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pmullw m2, m0
+ cmp hd, 8
+ jl .w8_filter_left_h4
+ movu m4, [tlq-34]
+ punpcklwd m1, m0, m0
+ vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ je .w8_filter_left_end
+ vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ jmp .w8_filter_left_end
+.w8_filter_left_h4:
+ pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e
+.w8_filter_left_end:
+ paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m3
+ paddw m1, m2
+ pxor m2, m2
+ psrlw m1, 3
+ pavgw m1, m2
+ packssdw m5, m5
+ psubw m5, [base+pw_16to1]
+ pminsw m5, m11
+ vpblendvb m1, m0, m5
+ mova [rsp+96], m1
+ test r8d, r8d
+ jz .w8_main
+; upsample_main
+ vbroadcasti128 m10, [base+z_upsample]
+ vbroadcasti128 m7, [base+z2_y_shuf]
+ lea r5, [rsp+120]
+ movd xm1, dyd
+ vbroadcasti128 m4, [base+z_base_inc+2]
+ movd xm2, dxd
+ vpbroadcastw m1, xm1
+ vpbroadcastw m2, xm2
+ mov r7, dstq
+ paddw m4, m4
+ pmullw m0, m1, [base+z2_ymul8]
+ paddw m5, m2, m2
+ psllw xm1, 3
+ vpblendd m2, m5, 0xf0
+ lea r2d, [dxq+(66<<6)] ; xpos
+ paddw m4, m2
+ pshufd m6, m0, q2020
+ psraw xm0, 6
+ pxor xm1, xm1
+ psubw xm8, xm1, xm0
+ pand m6, m11
+ punpckhwd xm9, xm8, xm1
+ psllw m6, 9
+ punpcklwd xm8, xm1
+.w8_upsample_above_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6
+ movu xm1, [rsp+r2*2]
+ movu xm2, [rsp+r2*2+16]
+ lea r2d, [r3+dxq]
+ shr r3d, 6
+ vinserti128 m1, [rsp+r3*2], 1
+ vinserti128 m2, [rsp+r3*2+16], 1
+ pshufb m1, m10
+ pshufb m2, m10
+ punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
+ punpckhqdq m1, m2
+ pand m2, m11, m4
+ psubw m1, m0
+ psllw m2, 9
+ pmulhrsw m1, m2
+ paddw m0, m1
+ cmp r3d, 64
+ jge .w8_upsample_above_toponly
+ mova m1, m5
+ vpgatherdq m3, [r5+xm9*2], m5
+ mova m5, m1
+ vpgatherdq m2, [r5+xm8*2], m1
+ pshufb m3, m7
+ pshufb m2, m7
+ punpckldq m1, m2, m3
+ punpckhdq m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m6
+ paddw m1, m2
+ vpermq m1, m1, q3120
+ psraw m2, m4, 15
+ vpblendvb m0, m1, m2
+.w8_upsample_above_toponly:
+ paddw m4, m5
+ sub r5, 4
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_ret
+ lea dstq, [dstq+strideq*2]
+ jmp .w8_upsample_above_loop
+.w8_main:
+ vbroadcasti128 m7, [base+z2_y_shuf]
+ lea r11, [rsp+120]
+ mov r8, -4
+.w8_main_upsample_left:
+ movd xm1, dyd
+ vbroadcasti128 m4, [base+z_base_inc+2]
+ movd xm2, dxd
+ vpbroadcastw m1, xm1
+ vpbroadcastw m2, xm2
+ mov r7, dstq
+ pmullw m0, m1, [base+z2_ymul8]
+ paddw m5, m2, m2
+ psllw xm1, 3
+ vpblendd m2, m5, 0xf0 ; xpos0 xpos1
+ lea r9d, [dxq+(65<<6)] ; xpos
+ paddw m4, m2
+ movd [rsp+284], xm1
+.w8_loop0:
+ mov r2d, r9d
+ mova [rsp+288], m0
+ mov r5, r11
+ mova [rsp+320], m4
+ pshufd m6, m0, q2020
+ psraw xm0, 6
+ pxor xm1, xm1
+ psubw xm8, xm1, xm0 ; base_y
+ pand m6, m11 ; frac_y
+ punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7
+ psllw m6, 9
+ punpcklwd xm8, xm1 ; base_y 0 1 4 5
+.w8_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2*2]
+ movu xm1, [rsp+r2*2+2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vinserti128 m0, [rsp+r3*2], 1
+ vinserti128 m1, [rsp+r3*2+2], 1
+ pand m2, m11, m4
+ psubw m1, m0
+ psllw m2, 9
+ pmulhrsw m1, m2
+ paddw m0, m1
+ cmp r3d, 64
+ jge .w8_toponly
+ mova m1, m5
+ vpgatherdq m3, [r5+xm9*2], m5
+ mova m5, m1
+ vpgatherdq m2, [r5+xm8*2], m1
+ pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1
+ pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1
+ punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m6
+ paddw m1, m2
+ vpermq m1, m1, q3120
+ psraw m2, m4, 15 ; base_x < topleft
+ vpblendvb m0, m1, m2
+.w8_toponly:
+ paddw m4, m5 ; xpos += dx
+ add r5, r8
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, (63-8)<<6
+ jge .w8_loop
+.w8_leftonly_loop:
+ mova m0, m5
+ vpgatherdq m4, [r5+xm9*2], m5
+ mova m5, m0
+ vpgatherdq m3, [r5+xm8*2], m0
+ add r5, r8
+ pshufb m2, m4, m7
+ pshufb m1, m3, m7
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ psubw m1, m0
+ pmulhrsw m1, m6
+ paddw m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_leftonly_loop
+.w8_end:
+ sub r10d, 1<<8
+ jl .w8_ret
+ vpbroadcastd m0, [rsp+284]
+ add r7, 16
+ paddw m0, [rsp+288] ; base_y += 8*dy
+ add r9d, 8<<6
+ vpbroadcastd m4, [pw_512]
+ movzx hd, r10b
+ paddw m4, [rsp+320] ; base_x += 8*64
+ mov dstq, r7
+ jmp .w8_loop0
+.w8_ret:
+ RET
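+; w16/w32/w64 entry points: copy the rest of the top edge (plus one pixel past
+; the block width) into the contiguous stack row starting at [rsp+160], set
+; the column-block count in r10d, filter the extended edge unless
+; enable_intra_edge_filter is off, and reuse the shared .w8_main loop.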
+.w16:
+ movd xm0, [tlq+32]
+ lea r10d, [hq+(1<<8)]
+ movd [rsp+160], xm0
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w16_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2]
+ movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ punpcklwd xm2, xm1, xm1
+ vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ punpckhwd m3, m0, m0
+ pmullw m4, m0
+ vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ paddw m1, m3
+ vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g
+ paddw m2, m3
+ vpbroadcastd m3, r6m ; max_width
+ pmullw m1, m5
+ pmullw m2, m6
+ packssdw m3, m3
+ paddw m1, m4
+ paddw m1, m2
+ psubw m3, [base+pw_1to16]
+ pxor m4, m4
+ psrlw m1, 3
+ pminsw m3, m11
+ pavgw m1, m4
+ vpblendvb m1, m0, m3
+ movu [rsp+130], m1
+.w16_no_filter_above:
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ test r3d, r3d
+ jz .w8_main
+ popcnt r3d, r3d
+ cmp r3d, 3
+ jne .w16_filter_left_s12
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m7, [base+pw_16]
+ cmp hd, 4
+ jne .filter_left_s3
+ movq xm0, [tlq-8] ; 0 1 2 3
+ movq xm1, [tlq-6] ; 1 2 3 4
+ vpbroadcastd xm5, r7m ; max_height
+ movq xm4, [base+pw_16to1+24] ; 4to1
+ pshuflw xm2, xm0, q2100 ; 0 0 1 2
+ pshuflw xm3, xm1, q3321 ; 2 3 4 4
+ paddw xm1, xm0
+ paddw xm1, xm2
+ pshuflw xm2, xm0, q1000 ; 0 0 0 1
+ paddw xm3, xm6
+ packssdw xm5, xm5
+ pavgw xm2, xm3
+ psubw xm5, xm4
+ paddw xm1, xm2
+ pminsw xm5, xm11
+ psrlw xm1, 2
+ vpblendvb xm1, xm0, xm5
+ movq [rsp+120], xm1
+ jmp .w8_main
+.w32:
+ mova m2, [tlq+32]
+ movd xm0, [tlq+64]
+ lea r10d, [hq+(3<<8)]
+ mova [rsp+160], m2
+ movd [rsp+192], xm0
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m0, r6m ; max_width
+ vpbroadcastd m7, [base+pw_16]
+ mov r3d, 32
+ packssdw m0, m0
+ psubw m0, [base+pw_1to16]
+ pminsw m8, m0, m11
+ psubw m9, m8, m7
+.w32_filter_above:
+ movu m0, [tlq+2]
+ punpcklwd xm4, xm1, xm1
+ paddw m2, m6, [tlq+6]
+ paddw m1, m0
+ vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m1, [tlq+4]
+ movu m3, [tlq+r3+2]
+ paddw m5, m6, [tlq+r3-2]
+ pavgw m2, m4
+ punpckhwd m4, m3, m3
+ paddw m1, m2
+ vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ pavgw m2, m5
+ paddw m5, m3, [tlq+r3]
+ paddw m4, m5
+ psrlw m1, 2
+ paddw m2, m4
+ vpblendvb m1, m0, m8
+ psrlw m2, 2
+ vpblendvb m2, m3, m9
+ movu [rsp+130], m1
+ movu [rsp+r3+130], m2
+.filter_left_s3:
+ cmp hd, 16
+ jl .filter_left_s3_h8 ; h8
+.filter_left_s3b:
+ mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ vpbroadcastd m5, r7m ; max_height
+ paddw m1, m0, m2
+ punpckhwd m2, m2
+ mov r3d, hd
+ vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ packssdw m5, m5
+ not r3
+ psubw m5, [base+pw_16to1]
+ paddw m2, m6
+ pminsw m8, m11, m5
+ je .filter_left_s3_end ; h16
+ paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m2
+ psrlw m1, 2
+ vpblendvb m3, m1, m0, m8
+ mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j
+ psubw m8, m7
+ mova [rsp+96], m3
+ jnp .filter_left_s3_end ; h32
+ mova m5, [tlq-96]
+ paddw m1, [tlq-66]
+ pavgw m2, [tlq-68]
+ paddw m1, m2
+ paddw m4, m5, [tlq-94]
+ paddw m2, m6, [tlq-92]
+ psrlw m1, 2
+ paddw m4, [tlq- 98]
+ pavgw m2, [tlq-100]
+ vpblendvb m3, m1, m0, m8
+ mova m0, [tlq-128]
+ psubw m8, m7
+ paddw m4, m2
+ paddw m1, m0, [tlq-126]
+ paddw m2, m6, [tlq-124]
+ psrlw m4, 2
+ mova [rsp+64], m3
+ vpblendvb m4, m5, m8
+ psubw m8, m7
+ mova [rsp+32], m4
+.filter_left_s3_end:
+ punpcklwd xm3, xm0, xm0
+ vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m4
+ pavgw m2, m3
+ paddw m1, m2
+ psrlw m1, 2
+ vpblendvb m1, m0, m8
+ mova [rsp+r3*2+130], m1
+ jmp .w8_main
+.filter_left_s3_h8:
+ mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7
+ movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8
+ pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6
+ vpbroadcastd xm5, r7m ; max_height
+ paddw xm1, xm0, xm3
+ pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8
+ paddw xm1, xm2
+ vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5
+ paddw xm3, xm6
+ packssdw xm5, xm5
+ pavgw xm2, xm3
+ psubw xm5, [base+pw_16to1+16] ; 8to1
+ paddw xm1, xm2
+ pminsw xm5, xm11
+ psrlw xm1, 2
+ vpblendvb xm1, xm0, xm5
+ mova [rsp+112], xm1
+ jmp .w8_main
+.w64:
+ mova m2, [tlq+ 32]
+ mova m3, [tlq+ 64]
+ mova m4, [tlq+ 96]
+ movd xm0, [tlq+128]
+ lea r10d, [hq+(7<<8)]
+ mova [rsp+160], m2
+ mova [rsp+192], m3
+ mova [rsp+224], m4
+ movd [rsp+256], xm0
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastd m6, [base+pw_3]
+ movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h
+ paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movu m4, [tlq+66]
+ paddw m3, m6, [tlq+62]
+ paddw m7, m4, [tlq+64]
+ pavgw m3, [tlq+70]
+ paddw m7, [tlq+68]
+ paddw m2, m5
+ vpbroadcastd m5, r6m ; max_width
+ mov r3d, 96
+ packssdw m5, m5
+ paddw m3, m7
+ psubw m5, [base+pw_1to16]
+ psrlw m2, 2
+ vpbroadcastd m7, [base+pw_16]
+ psrlw m3, 2
+ pminsw m8, m11, m5
+ psubw m9, m8, m7
+ vpblendvb m2, m0, m9
+ psubw m9, m7
+ vpblendvb m3, m4, m9
+ psubw m9, m7
+ movu [rsp+162], m2
+ movu [rsp+194], m3
+ jmp .w32_filter_above
+
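+; Z3 directional prediction (angle > 180): all pixels come from the left edge.
+; Dispatch is on block height (.h4/.h8/...); each path steps across the width,
+; interpolating down the left edge with the same base/frac scheme as z1 (ypos
+; advancing by dy from column to column) and transposing the columns into rows
+; as they are stored.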
+cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z3_16bpc_avx2_table]
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ lea r7, [dr_intra_derivative+45*2-1]
+ sub tlq, 2
+ movsxd hq, [r6+hq*4]
+ sub angled, 180
+ add hq, r6
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ movzx dyd, word [r7+dyq]
+ vpbroadcastd m5, [pw_62]
+ mov org_wd, wd
+ jmp hq
+.h4:
+ ALLOC_STACK -64, 7
+ lea r7, [strideq*3]
+ cmp angleb, 40
+ jae .h4_no_upsample
+ lea r4d, [angleq-1024]
+ sar r4d, 7
+ add r4d, wd
+ jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+ mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7
+ pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
+ vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
+ pshufd xm3, xm1, q0000
+ paddw xm1, xm2
+ paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8
+ vpbroadcastw xm4, r8m ; pixel_max
+ add dyd, dyd
+ psubw xm0, xm1, xm0
+ mova [rsp+ 0], xm3
+ movd xm3, dyd
+ psraw xm0, 3
+ neg dyd
+ paddw xm1, xm0
+ pxor xm0, xm0
+ lea r2d, [dyq+(16<<6)+63] ; ypos
+ pmaxsw xm1, xm0
+ pavgw xm1, xm0
+ vpbroadcastw m3, xm3
+ pminsw xm1, xm4
+ punpckhwd xm0, xm1, xm2
+ punpcklwd xm1, xm2
+ paddw m2, m3, m3
+ mova [rsp+32], xm0
+ punpcklwd m3, m2
+ mova [rsp+16], xm1
+ paddw m4, m2, m2
+ paddw m2, m3
+ vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3
+.h4_upsample_loop:
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ movu xm1, [rsp+r2*2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ movu xm2, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ vinserti128 m1, [rsp+r2*2], 1
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ vinserti128 m2, [rsp+r4*2], 1
+ psrld m0, m1, 16
+ pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
+ pslld m2, 16
+ pblendw m1, m2, 0xaa
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m3, m4
+ paddw m1, m0
+ vextracti128 xm2, m1, 1
+ punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
+ movhps [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r7 ], xm1
+ add dstq, 8
+ sub wd, 4
+ jg .h4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
+%define base r4-z_filter_t0
+ lea r4, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m1, xm1
+ pcmpeqb m0, [base+z_filter_wh]
+ pand m0, m1
+ mova xm1, [r4+angleq*8]
+ pcmpgtb m0, m1
+ pmovmskb r5d, m0
+ ret
+.h4_no_upsample:
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea maxbased, [wq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7
+ movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8
+ vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw xm2, xm0
+ pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
+ paddw xm1, xm0, xm3
+ movd [rsp+12], xm0
+ pmullw xm1, xm4
+ cmp r5d, 3
+ jne .h4_filter_3tap
+ pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8
+ vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
+ movzx r4d, word [tlq-14]
+ movzx r2d, word [tlq-12]
+ inc maxbased
+ paddw xm1, xm2
+ paddw xm0, xm3
+ sub r2d, r4d
+ paddw xm2, xm0, xm0
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+14], r2w
+.h4_filter_3tap:
+ pxor xm0, xm0
+ paddw xm1, xm2
+ lea tlq, [rsp+30]
+ psrlw xm1, 3
+ cmp wd, 8
+ sbb maxbased, -1
+ pavgw xm0, xm1
+ mova [rsp+16], xm0
+.h4_main:
+ movd xm3, dyd
+ neg maxbaseq
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ lea r4d, [maxbaseq+3*64]
+ neg dyq
+ movd xm2, r4d
+ sub tlq, 8
+ lea r4, [dyq+63] ; ypos
+ punpcklwd m1, m1
+ paddw m0, m3, m3
+ vpbroadcastw m2, xm2
+ punpcklwd m3, m0
+ paddw m4, m0, m0
+ paddw m0, m3
+ psubw m2, m1
+ vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3
+ or maxbased, 63
+ paddw m3, m2
+.h4_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu xm1, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ movu xm2, [tlq+r5*2]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ vinserti128 m1, [tlq+r4*2], 1
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vinserti128 m2, [tlq+r5*2], 1
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ pand m2, m5, m3
+ palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; ypos < max_base_y
+ paddw m3, m4
+ paddw m1, m0
+ vpblendvb m1, m6, m1, m2
+ vextracti128 xm2, m1, 1
+ punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
+ movhps [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r7 ], xm1
+ sub wd, 4
+ jz .h4_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h4_loop
+.h4_end_loop:
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ movq [dstq+strideq*2], xm6
+ movq [dstq+r7 ], xm6
+ add dstq, 8
+ sub wd, 4
+ jg .h4_end_loop
+.h4_end:
+ RET
+.h8:
+ lea r4d, [angleq+216]
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 8
+ mov r4b, wb
+ lea r7, [strideq*3]
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d
+ cmp wd, 8
+ je .h8_upsample_w8
+ pshufhw xm3, xm2, q1000
+ vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d
+.h8_upsample_w8:
+ paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastw m4, r8m ; pixel_max
+ add dyd, dyd
+ psubw m0, m1, m0
+ movd xm6, dyd
+ psraw m0, 3
+ neg dyd
+ paddw m1, m0
+ pxor m0, m0
+ pmaxsw m1, m0
+ lea r4d, [dyq+(16<<6)+63] ; ypos
+ pavgw m1, m0
+ vpbroadcastw m6, xm6
+ pminsw m1, m4
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ vextracti128 [rsp+48], m0, 1
+ vextracti128 [rsp+32], m1, 1
+ paddw m7, m6, m6
+ mova [rsp+16], xm0
+ mova [rsp+ 0], xm1
+ punpcklwd m6, m7 ; ypos0 ypos1
+.h8_upsample_loop:
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base0
+ movu m1, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base1
+ movu m2, [rsp+r2*2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base2
+ movu m3, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base3
+ movu m4, [rsp+r2*2]
+ psrld m0, m1, 16
+ pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
+ pslld m2, 16
+ pblendw m1, m2, 0xaa
+ psrld m2, m3, 16
+ pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0
+ pslld m4, 16
+ pblendw m3, m4, 0xaa
+ pand m4, m5, m6
+ paddw m6, m7
+ psllw m4, 9
+ psubw m1, m0
+ pmulhrsw m1, m4
+ pand m4, m5, m6
+ psllw m4, 9
+ psubw m3, m2
+ pmulhrsw m3, m4
+ paddw m6, m7
+ lea r2, [dstq+strideq*4]
+ paddw m1, m0
+ paddw m3, m2
+ punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ movhps [r2 +strideq*0], xm0
+ movq [r2 +strideq*1], xm0
+ movhps [r2 +strideq*2], xm1
+ movq [r2 +r7 ], xm1
+ movhps [dstq+strideq*0], xm2
+ movq [dstq+strideq*1], xm2
+ movhps [dstq+strideq*2], xm3
+ movq [dstq+r7 ], xm3
+ add dstq, 8
+ sub wd, 4
+ jg .h8_upsample_loop
+ RET
+.h8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(w+7, 15)
+ jmp .h8_main
+.h8_no_upsample:
+ lea maxbased, [wq+7]
+ test angled, 0x400
+ jnz .h8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h8_main
+ popcnt r5d, r5d
+ mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m2, m0
+ cmp wd, 8
+ jl .h8_filter_w4
+ punpcklwd xm0, xm0
+ vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movd [rsp+28], xm0
+ paddw m1, m3
+ mov r4d, 16
+ pmullw m1, m4
+ cmovg maxbased, r4d
+ cmp r5d, 3
+ jne .h8_filter_3tap
+ punpckhwd m3, m3
+ vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ movzx r4d, word [tlq-30]
+ movzx r2d, word [tlq-28]
+ inc maxbased
+ paddw m1, m2
+ paddw m0, m3
+ sub r2d, r4d
+ paddw m2, m0, m0
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+30], r2w
+ jmp .h8_filter_3tap
+.h8_filter_w4:
+ pshufhw xm1, xm0, q2100
+ vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e
+ paddw m1, m3
+ pmullw m1, m4
+.h8_filter_3tap:
+ pxor m0, m0
+ paddw m1, m2
+ lea tlq, [rsp+62]
+ psrlw m1, 3
+ pavgw m0, m1
+ mova [rsp+32], m0
+.h8_main:
+ movd xm4, dyd
+ neg maxbaseq
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m7, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ lea r4d, [maxbaseq+7*64]
+ neg dyq
+ movd xm2, r4d
+ sub tlq, 16
+ lea r4, [dyq+63]
+ paddw m6, m4, m4
+ vpbroadcastw m2, xm2
+ vpblendd m4, m6, 0xf0 ; ypos0 ypos1
+ psubw m2, m1
+ or maxbased, 63
+ paddw m4, m2
+.h8_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu xm0, [tlq+r4*2+2]
+ movu xm1, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ vinserti128 m0, [tlq+r5*2+2], 1
+ vinserti128 m1, [tlq+r5*2], 1
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m1, m0
+ pmulhrsw m1, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ paddw m0, m1
+ movu xm1, [tlq+r4*2+2]
+ movu xm2, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vpblendvb m0, m7, m0, m3
+ vinserti128 m1, [tlq+r5*2+2], 1
+ vinserti128 m2, [tlq+r5*2], 1
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m2, m1
+ pmulhrsw m2, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ lea r5, [dstq+strideq*4]
+ paddw m1, m2
+ vpblendvb m1, m7, m1, m3
+ punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0
+ vextracti128 xm3, m2, 1
+ punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c4 b7 d7 b6 d6 b5 d5 b4 d4
+ punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2
+ vextracti128 xm3, m0, 1
+ movhps [dstq+strideq*0], xm1
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r7 ], xm2
+ punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4
+ punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6
+ movhps [r5 +strideq*0], xm1
+ movq [r5 +strideq*1], xm1
+ movhps [r5 +strideq*2], xm0
+ movq [r5 +r7 ], xm0
+ sub wd, 4
+ jz .h8_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h8_loop
+ lea r6, [strideq*5]
+ lea r2, [strideq+r7*2] ; stride*7
+ test wd, 4
+ jz .h8_end_loop
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ movq [dstq+strideq*2], xm7
+ movq [dstq+r7 ], xm7
+ movq [dstq+strideq*4], xm7
+ movq [dstq+r6 ], xm7
+ movq [dstq+r7*2 ], xm7
+ movq [dstq+r2 ], xm7
+ add dstq, 8
+ sub wd, 4
+ jz .h8_end
+.h8_end_loop:
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ mova [dstq+strideq*2], xm7
+ mova [dstq+r7 ], xm7
+ mova [dstq+strideq*4], xm7
+ mova [dstq+r6 ], xm7
+ mova [dstq+r7*2 ], xm7
+ mova [dstq+r2 ], xm7
+ add dstq, 16
+ sub wd, 8
+ jg .h8_end_loop
+.h8_end:
+ RET
+.h16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(w+15, 31)
+ jmp .h16_main
+ALIGN function_align
+.h16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 10
+ lea maxbased, [wq+15]
+ lea r7, [strideq*3]
+ test angled, 0x400
+ jnz .h16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ popcnt r5d, r5d
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pmullw m1, m7
+ paddw m1, m2
+ cmp wd, 8
+ jg .h16_filter_w16
+ mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7
+ pmullw xm6, xm3
+ jl .h16_filter_w4
+ pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
+ cmp r5d, 3
+ jne .h16_filter_w8_3tap
+ vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
+.h16_filter_w8_5tap:
+ punpckhwd m0, m0
+ vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9
+ paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw xm4, xm4
+ paddw m0, m0
+ paddw xm6, xm4
+ paddw m1, m0
+.h16_filter_w8_3tap:
+ paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8
+ pmullw xm3, xm7
+ pxor m0, m0
+ paddw xm3, xm6
+ psrlw xm3, 3
+ pavgw xm3, xm0
+ mova [rsp+48], xm3
+ jmp .h16_filter_end
+.h16_filter_w4:
+ pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6
+ cmp r5d, 3
+ jne .h16_filter_w8_3tap
+ pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5
+ jmp .h16_filter_w8_5tap
+.h16_filter_w16:
+ mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ pmullw m6, m3
+ punpcklwd xm3, xm3
+ vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ mov r4d, 32
+ cmp wd, 16
+ cmovg maxbased, r4d
+ movd [rsp+28], xm3
+ pmullw m4, m7
+ cmp r5d, 3
+ jne .h16_filter_w16_3tap
+ punpckhwd m0, m0
+ vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movzx r4d, word [tlq-62]
+ movzx r2d, word [tlq-60]
+ or maxbased, 1
+ paddw m3, m3
+ sub r2d, r4d
+ paddw m0, m0
+ lea r2d, [r2+r4*8+4]
+ paddw m4, m3
+ shr r2d, 3
+ paddw m1, m0
+ mov [rsp+30], r2w
+.h16_filter_w16_3tap:
+ pxor m0, m0
+ paddw m4, m6
+ psrlw m4, 3
+ pavgw m4, m0
+ mova [rsp+32], m4
+.h16_filter_end:
+ psrlw m1, 3
+ lea tlq, [rsp+94]
+ pavgw m1, m0
+ mova [rsp+64], m1
+.h16_main:
+ movd xm8, dyd
+ neg maxbaseq
+ vpbroadcastw m9, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m8, xm8
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ movd xm7, r4d
+ sub tlq, 32
+ lea r4, [dyq+63]
+ vpbroadcastw m7, xm7
+ or maxbased, 63
+ psubw m7, [z_base_inc]
+.h16_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu m0, [tlq+r4*2+2]
+ movu m2, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ movu m1, [tlq+r5*2+2]
+ movu m3, [tlq+r5*2]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m2, m0
+ pmulhrsw m2, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m0, m2
+ movu m2, [tlq+r4*2+2]
+ movu m4, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vpblendvb m0, m9, m0, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m3, m1
+ pmulhrsw m3, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m1, m3
+ vpblendvb m1, m9, m1, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m4, m2
+ pmulhrsw m4, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m2, m4
+ movu m3, [tlq+r5*2+2]
+ movu m4, [tlq+r5*2]
+ vpblendvb m2, m9, m2, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m4, m3
+ pmulhrsw m4, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ lea r5, [dstq+strideq*4]
+ paddw m3, m4
+ vpblendvb m3, m9, m3, m6
+ punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4
+ punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0
+ punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4
+ punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0
+ vextracti128 xm6, m3, 1
+ punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2
+ punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4
+ punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6
+ vextracti128 xm2, m4, 1
+ movhps [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ vextracti128 xm6, m1, 1
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r7 ], xm2
+ vextracti128 xm2, m0, 1
+ movhps [r5 +strideq*0], xm6
+ movq [r5 +strideq*1], xm6
+ movhps [r5 +strideq*2], xm2
+ movq [r5 +r7 ], xm2
+ lea r5, [dstq+strideq*8]
+ movhps [r5 +strideq*0], xm3
+ movq [r5 +strideq*1], xm3
+ movhps [r5 +strideq*2], xm4
+ movq [r5 +r7 ], xm4
+ lea r5, [r5+strideq*4]
+ movhps [r5 +strideq*0], xm1
+ movq [r5 +strideq*1], xm1
+ movhps [r5 +strideq*2], xm0
+ movq [r5 +r7 ], xm0
+ sub wd, 4
+ jz .h16_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h16_loop
+ mov hd, 4
+.h16_end_loop0:
+ mov r6d, wd
+ mov r2, dstq
+ test wb, 4
+ jz .h16_end_loop
+ movq [dstq+strideq*0], xm9
+ movq [dstq+strideq*1], xm9
+ movq [dstq+strideq*2], xm9
+ movq [dstq+r7 ], xm9
+ and r6d, 120
+ jz .h16_end_w4
+ add dstq, 8
+.h16_end_loop:
+ mova [dstq+strideq*0], xm9
+ mova [dstq+strideq*1], xm9
+ mova [dstq+strideq*2], xm9
+ mova [dstq+r7 ], xm9
+ add dstq, 16
+ sub r6d, 8
+ jg .h16_end_loop
+.h16_end_w4:
+ lea dstq, [r2+strideq*4]
+ dec hd
+ jg .h16_end_loop0
+.h16_end:
+ RET
+.h32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -160, 9
+ lea maxbased, [wq+31]
+ and maxbased, 31
+ or maxbased, 32 ; imin(w+31, 63)
+ test angled, 0x400
+ jnz .h32_main
+ vpbroadcastd m2, [pw_3]
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ punpckhwd m1, m0, m0
+ vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m2
+ paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ lea r4, [rsp+128]
+ paddw m0, m1
+ lea r5d, [maxbaseq-31]
+ psrlw m0, 2
+ mova [r4], m0
+.h32_filter_loop:
+ mova m0, [tlq-62]
+ paddw m1, m2, [tlq-66]
+ paddw m0, [tlq-64]
+ pavgw m1, [tlq-58]
+ paddw m0, [tlq-60]
+ sub tlq, 32
+ sub r4, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r4], m0
+ sub r5d, 16
+ jg .h32_filter_loop
+ jl .h32_filter_h8
+ mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movzx r5d, word [tlq-62]
+ movzx r2d, word [tlq-60]
+ pavgw m2, m3
+ sub r2d, r5d
+ paddw m0, m1
+ lea r2d, [r2+r5*8+4]
+ paddw m0, m2
+ shr r2d, 3
+ psrlw m0, 2
+ mova [r4-32], m0
+ mov [r4-36], r5w
+ mov [r4-34], r2w
+ lea tlq, [rsp+158]
+ mov r4d, 65
+ cmp wd, 64
+ cmove maxbased, r4d
+ jmp .h32_main
+.h32_filter_h8:
+ mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7
+ pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
+ paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9
+ paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8
+ vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
+ lea tlq, [rsp+158]
+ pavgw xm2, xm3
+ paddw xm0, xm1
+ paddw xm0, xm2
+ psrlw xm0, 2
+ mova [r4-16], xm0
+.h32_main:
+ movd xm6, dyd
+ neg maxbaseq
+ vpbroadcastw m7, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ movd xm4, r4d
+ vpbroadcastd m8, [pw_m1024]
+ lea r4, [dyq+63]
+ vpbroadcastw m4, xm4
+ or maxbased, 63
+ psubw m4, [z_base_inc]
+.h32_loop:
+ mov r5, r4
+ sar r5, 6
+ movu m1, [tlq+r5*2-64]
+ movu m0, [tlq+r5*2-62]
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m1, m0
+ pmulhrsw m1, m3
+ pcmpgtw m2, m8, m4
+ paddw m0, m1
+ vpblendvb m0, m7, m0, m2
+ movu m2, [tlq+r5*2-32]
+ movu m1, [tlq+r5*2-30]
+ add r4, dyq
+ sub rsp, 64
+ psubw m2, m1
+ pmulhrsw m2, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ mova [rsp+32*0], m0
+ paddw m1, m2
+ vpblendvb m1, m7, m1, m3
+ mova [rsp+32*1], m1
+ dec wd
+ jz .h32_transpose
+ cmp r4d, maxbased
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 64
+ mova [rsp+32*0], m7
+ mova [rsp+32*1], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ lea r3, [strideq*3]
+ lea r4, [strideq*5]
+ mov r8, dstq
+ lea r5, [strideq+r3*2]
+.h32_transpose_loop0:
+ lea r6, [rsp+32]
+ lea r2, [r8+org_wq*2-16]
+.h32_transpose_loop:
+ mova m0, [r6+64*7]
+ mova m1, [r6+64*6]
+ mova m2, [r6+64*5]
+ mova m3, [r6+64*4]
+ mova m4, [r6+64*3]
+ mova m5, [r6+64*2]
+ mova m6, [r6+64*1]
+ mova m7, [r6+64*0]
+ punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4
+ punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0
+ punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4
+ punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0
+ punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4
+ punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0
+ punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4
+ lea dstq, [r2+strideq*8]
+ sub r6, 32
+ punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2
+ punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0
+ punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2
+ punpckhqdq m5, m7, m1 ; 8 0
+ vextracti128 [r2 +strideq*0], m5, 1
+ punpcklqdq m7, m1 ; 9 1
+ mova [dstq+strideq*0], xm5
+ punpckhqdq m1, m8, m3 ; 10 2
+ vextracti128 [r2 +strideq*1], m7, 1
+ punpcklqdq m8, m3 ; 11 3
+ mova [dstq+strideq*1], xm7
+ punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4
+ vextracti128 [r2 +strideq*2], m1, 1
+ punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6
+ mova [dstq+strideq*2], xm1
+ punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4
+ vextracti128 [r2 +r3 ], m8, 1
+ punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6
+ mova [dstq+r3 ], xm8
+ punpckhqdq m6, m3, m2 ; 12 4
+ vextracti128 [r2 +strideq*4], m6, 1
+ punpcklqdq m3, m2 ; 13 5
+ mova [dstq+strideq*4], xm6
+ punpckhqdq m2, m0, m4 ; 14 6
+ vextracti128 [r2 +r4 ], m3, 1
+ punpcklqdq m0, m4 ; 15 7
+ mova [dstq+r4 ], xm3
+ vextracti128 [r2 +r3*2 ], m2, 1
+ mova [dstq+r3*2 ], xm2
+ vextracti128 [r2 +r5 ], m0, 1
+ mova [dstq+r5 ], xm0
+ lea r2, [dstq+strideq*8]
+ cmp r6, rsp
+ jae .h32_transpose_loop
+ add rsp, 64*8
+ sub org_wd, 8
+ jg .h32_transpose_loop0
+.h32_end:
+ RET
+.h64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -256, 10
+ lea maxbased, [wq+63]
+ test angled, 0x400
+ jnz .h64_main
+ vpbroadcastd m2, [pw_3]
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ punpckhwd m1, m0, m0
+ vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m2
+ paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ lea r4, [rsp+224]
+ paddw m0, m1
+ lea r5d, [wq+32]
+ psrlw m0, 2
+ mova [r4], m0
+.h64_filter_loop:
+ mova m0, [tlq-62]
+ paddw m1, m2, [tlq-66]
+ paddw m0, [tlq-64]
+ pavgw m1, [tlq-58]
+ paddw m0, [tlq-60]
+ sub tlq, 32
+ sub r4, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r4], m0
+ sub r5d, 16
+ jg .h64_filter_loop
+ mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ lea tlq, [rsp+254]
+ pavgw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ psrlw m0, 2
+ mova [r4-32], m0
+.h64_main:
+ neg maxbaseq
+ movd xm4, dyd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ vpbroadcastd m7, [pw_m1024]
+ movd xm3, r4d
+ lea r4, [dyq+63]
+ paddw m8, m7, m7
+ vpbroadcastw m3, xm3
+ or maxbased, 63
+ paddw m9, m8, m7
+ psubw m3, [z_base_inc]
+.h64_loop:
+ mov r5, r4
+ sar r5, 6
+ movu m1, [tlq+r5*2-128]
+ movu m0, [tlq+r5*2-126]
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ sub rsp, 128
+ paddw m0, m1
+ pcmpgtw m1, m9, m3
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*0], m0
+ movu m1, [tlq+r5*2-96]
+ movu m0, [tlq+r5*2-94]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m8, m3
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*1], m0
+ movu m1, [tlq+r5*2-64]
+ movu m0, [tlq+r5*2-62]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m7, m3
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*2], m0
+ movu m1, [tlq+r5*2-32]
+ movu m0, [tlq+r5*2-30]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ add r4, dyq
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [rsp+32*3], m0
+ dec wd
+ jz .h64_transpose
+ cmp r4d, maxbased
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 128
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m6
+ mova [rsp+32*2], m6
+ mova [rsp+32*3], m6
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ mov r5, dstq
+ lea r4, [strideq+r2*2]
+.h64_transpose_loop0:
+ lea r6, [rsp+112]
+ lea dstq, [r5+org_wq*2-32]
+.h64_transpose_loop:
+ mova xm0, [r6+128*15]
+ vinserti128 m0, [r6+128* 7], 1
+ mova xm1, [r6+128*14]
+ vinserti128 m1, [r6+128* 6], 1
+ mova xm2, [r6+128*13]
+ vinserti128 m2, [r6+128* 5], 1
+ mova xm3, [r6+128*12]
+ vinserti128 m3, [r6+128* 4], 1
+ mova xm4, [r6+128*11]
+ vinserti128 m4, [r6+128* 3], 1
+ mova xm5, [r6+128*10]
+ vinserti128 m5, [r6+128* 2], 1
+ mova xm6, [r6+128* 9]
+ vinserti128 m6, [r6+128* 1], 1
+ mova xm7, [r6+128* 8]
+ vinserti128 m7, [r6+128* 0], 1
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ sub r6, 16
+ punpckhdq m7, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m3, m5
+ punpckldq m3, m5
+ punpckhqdq m5, m7, m1
+ punpcklqdq m7, m1
+ punpckhqdq m1, m8, m3
+ punpcklqdq m8, m3
+ punpckhdq m3, m0, m2
+ mova [dstq+strideq*0], m5
+ punpckldq m0, m2
+ mova [dstq+strideq*1], m7
+ punpckhdq m2, m4, m6
+ mova [dstq+strideq*2], m1
+ punpckldq m4, m6
+ mova [dstq+r2 ], m8
+ punpckhqdq m6, m3, m2
+ mova [dstq+strideq*4], m6
+ punpcklqdq m3, m2
+ mova [dstq+r3 ], m3
+ punpckhqdq m2, m0, m4
+ mova [dstq+r2*2 ], m2
+ punpcklqdq m0, m4
+ mova [dstq+r4 ], m0
+ lea dstq, [dstq+strideq*8]
+ cmp r6, rsp
+ jae .h64_transpose_loop
+ add rsp, 128*16
+ sub org_wd, 16
+ jg .h64_transpose_loop0
+.h64_end:
+ RET
+
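+; FILTER_1BLK applies the filter_intra taps to a single 4x2 block: the source
+; pixels in %2 are shuffled into place, multiplied by the four tap rows held in
+; m2-m5, rounded (m1 holds the pd_8 bias) and shifted right by 4, then packed
+; and clamped to bitdepth_max (%5).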
+%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
+%ifnum %4
+ pshufb xm%2, xm%4
+%else
+ pshufb xm%2, %4
+%endif
+ vinserti128 m%2, xm%2, 1
+ pshufd m%1, m%2, q0000
+ pmaddwd m%1, m2
+ pshufd m%3, m%2, q1111
+ pmaddwd m%3, m3
+ paddd m%1, m1
+ paddd m%1, m%3
+ pshufd m%3, m%2, q2222
+ pmaddwd m%3, m4
+ paddd m%1, m%3
+ pshufd m%3, m%2, q3333
+ pmaddwd m%3, m5
+ paddd m%1, m%3
+ psrad m%1, 4
+ packusdw m%1, m%1
+ pminsw m%1, m%5
+%endmacro
+
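+; FILTER_2BLK is the same filtering applied to two 4x2 blocks at once: the two
+; 128-bit halves of %2 are processed as separate blocks (vpermq extracts the
+; upper one) and both results are packed together before the final clamp.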
+%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
+ pshufb m%2, m%6
+ vpermq m%4, m%2, q3232
+ vinserti128 m%2, xm%2, 1
+ pshufd m%1, m%2, q0000
+ pshufd m%3, m%4, q0000
+ pmaddwd m%1, m2
+ pmaddwd m%3, m2
+ paddd m%1, m1
+ paddd m%3, m1
+ pshufd m%5, m%2, q1111
+ pmaddwd m%5, m3
+ paddd m%1, m%5
+ pshufd m%5, m%4, q1111
+ pmaddwd m%5, m3
+ paddd m%3, m%5
+ pshufd m%5, m%2, q2222
+ pmaddwd m%5, m4
+ paddd m%1, m%5
+ pshufd m%5, m%4, q2222
+ pmaddwd m%5, m4
+ paddd m%3, m%5
+ pshufd m%5, m%2, q3333
+ pmaddwd m%5, m5
+ paddd m%1, m%5
+ pshufd m%5, m%4, q3333
+ pmaddwd m%5, m5
+ paddd m%3, m%5
+ psrad m%1, 4
+ psrad m%3, 4
+ packusdw m%1, m%3
+ pminsw m%1, m%7
+%endmacro
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row. One redundant
+; block is calculated for w8 and w16, two for w32.
+;     w4    w8      w16         w32
+;     1     1 2     1 2 3 5     1 2 3 5 b c d f
+;     2     2 3     2 4 5 7     2 4 5 7 c e f h
+;     3     3 4     4 6 7 9     4 6 7 9 e g h j
+;     ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
+;     5     8       8           i
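+; Each block only depends on the block above it and the block to its left, so
+; blocks on the same anti-diagonal (the positions sharing a number above) have
+; no mutual dependency and can be filtered together in one register.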
+
+cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
+%assign org_stack_offset stack_offset
+%define base r6-ipred_filter_16bpc_avx2_table
+ lea r6, [filter_intra_taps]
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ add filterq, r6
+ lea r6, [ipred_filter_16bpc_avx2_table]
+ vbroadcasti128 m0, [tlq-6]
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m1, [base+pd_8]
+ pmovsxbw m2, [filterq+16*0]
+ pmovsxbw m3, [filterq+16*1]
+ pmovsxbw m4, [filterq+16*2]
+ pmovsxbw m5, [filterq+16*3]
+ add wq, r6
+ mov hd, hm
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 10
+ mova xm8, [base+filter_shuf2]
+ vpbroadcastw m9, r8m ; bitdepth_max
+ lea r7, [6+hq*2]
+ sub tlq, r7
+ jmp .w4_loop_start
+.w4_loop:
+ pinsrq xm0, [tlq+hq*2], 0
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER_1BLK 6, 0, 7, 8, 9
+ vextracti128 xm0, m6, 1
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm0
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vbroadcasti128 m14, [base+filter_shuf3]
+ vpbroadcastw m15, r8m ; bitdepth_max
+ FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15
+ vpermq m6, m10, q1302 ; ____ ____ | ____ 4321
+ pslldq m8, m0, 4
+ psrldq m7, m6, 2
+ psrldq m0, m6, 10
+ punpcklwd m7, m0
+ vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321
+ vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321
+ vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321
+ lea r7, [16+hq*2]
+ sub tlq, r7
+ jmp .w8_loop_start
+.w8_loop:
+ vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321
+ vpermq m6, m9, q2031
+ psrldq m0, m6, 2
+ psrldq m6, 10
+ punpcklwd m6, m0
+ vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321
+ vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321
+ mova m10, m9
+.w8_loop_start:
+ vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321
+ call .main
+ vpblendd m10, m9, 0xCC
+ mova [dstq+strideq*0], xm10
+ vextracti128 [dstq+strideq*1], m10, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ %assign stack_offset stack_offset - stack_size_padded
+ ALLOC_STACK 32, 16
+ vpbroadcastw m15, r8m ; bitdepth_max
+ sub hd, 2
+ TAIL_CALL .w16_main, 0
+.w16_main:
+ mova xm10, [base+filter_shuf2]
+ FILTER_1BLK 13, 0, 6, 10, 15
+ vpermq m12, m13, q3120
+ mova xm14, [base+filter_shuf3]
+ vinserti128 m14, [base+filter_shuf1], 1
+ vpbroadcastq m0, [tlq+10]
+ vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____
+ psrldq m6, m12, 8
+ vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321
+ punpcklwd m6, m12
+ vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
+ vpblendd m13, m12, 0xCC
+ vpermq m12, m12, q2031 ; 6___ 5___
+ psrldq xm6, xm12, 2
+ psrldq xm8, xm12, 12
+ vpblendd xm6, xm8, 0x01
+ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_
+ FILTER_1BLK 11, 6, 8, 10, 15
+ vpermq m11, m11, q3120
+ pshufd m9, m11, q1032
+ movu m8, [tlq+6] ; __43 210_ | ____ ____
+ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____
+ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____
+ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+ lea r7, [20+hq*2]
+ sub tlq, r7
+ vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321
+ jmp .w16_loop_start
+.w16_loop:
+ vpermq m13, m13, q3322
+ vpermq m11, m9, q2020
+ vpermq m9, m9, q1302
+ vpermq m6, m12, q0123
+ psrldq m7, 4
+ vpblendd m13, m10, 0xCC
+ vpblendd m9, m7, 0x40
+ mova m0, [rsp+8]
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+.w16_loop_start:
+ mova m13, m12
+ vpblendd m0, [tlq+hq*2], 0x0C
+ psrldq m7, m12, 8
+ punpcklwd m7, m12
+ vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321
+ vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
+ vpermq m12, m10, q2031
+ mova [rsp+8], m0
+ psrldq m8, m11, 8
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 10
+ psrldq xm0, xm13, 2
+ punpcklwd m8, m11
+ punpcklwd xm7, xm6
+ vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321
+ vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321
+ vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321
+ call .main
+ vpermq m8, m11, q3120
+ vpblendd m6, m8, m9, 0xCC
+ mova [dstq+strideq*0+16], xm6
+ vextracti128 [dstq+strideq*1+16], m6, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ vpermq m8, m9, q3120
+ vextracti128 xm0, m8, 1 ; 4321 ____
+ pshufd xm11, xm11, q1032
+ vpblendd xm0, xm11, 0x02 ; 4321 0___
+ psrldq xm6, xm8, 2
+ psrldq xm7, xm8, 12
+ pblendw xm0, xm6, 0x4 ; 4321 05__
+ pblendw xm0, xm7, 0x2 ; 4321 056_
+ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
+ vpermq m12, m13, q1302
+ vpblendd m12, m10, 0xCC
+ vpblendd m9, m6, 0xCC
+ mova [dstq+strideq*0+ 0], xm12
+ mova [dstq+strideq*0+16], xm9
+ vextracti128 [dstq+strideq*1+ 0], m12, 1
+ vextracti128 [dstq+strideq*1+16], m9, 1
+ ret
+ALIGN function_align
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK 64, 16
+ vpbroadcastw m15, r8m ; bitdepth_max
+ sub hd, 2
+ lea r3, [dstq+32]
+ lea r5d, [hd*2+20]
+ call .w16_main
+ mov dstq, r3
+ lea tlq, [tlq+r5+32]
+ sub r5d, 20
+ shr r5d, 1
+ sub r5d, 2
+ lea r4, [dstq+strideq*2-2]
+DEFINE_ARGS dst, stride, tl, stride3, left, h
+ lea stride3q, [strideq*3]
+ movu m8, [tlq-6] ; 4321 0___
+ mova xm10, [base+filter_shuf2]
+ pinsrw xm0, xm8, [dstq+strideq*0-2], 2
+ pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_
+ pinsrw xm9, [leftq+strideq*0], 5
+ pinsrw xm9, [leftq+strideq*1], 4
+ FILTER_1BLK 13, 0, 6, 10, 15
+ vpermq m12, m13, q3120
+ mova xm14, [base+filter_shuf3]
+ vinserti128 m14, [base+filter_shuf1], 1
+ psrldq m6, m12, 8
+ punpcklwd m7, m6, m12
+ vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321
+ vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321
+ vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321
+ vpblendd m0, m9, 0x04 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
+ vpblendd m13, m12, 0xCC
+ pinsrw xm9, [leftq+strideq*2], 3
+ pinsrw xm9, [leftq+stride3q ], 2
+ lea leftq, [leftq+strideq*4]
+ pinsrw xm9, [leftq+strideq*0], 1
+ pinsrw xm9, [leftq+strideq*1], 0
+ movq [rsp+32], xm9
+ mov r7d, 1
+ pslldq m8, m9, 4
+ vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____
+ vpermq m12, m12, q2031 ; 6___ 5___
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 12
+ vpblendd xm6, xm7, 0x01 ; ____ _56_
+ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_
+ FILTER_1BLK 11, 6, 7, 10, 15
+ vpermq m11, m11, q3120
+ pshufd m9, m11, q1032
+ vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____
+ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____
+ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____
+ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+ vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321
+ jmp .w32_loop_start
+.w32_loop_last:
+ mova m0, [rsp+0]
+ jmp .w32_loop
+.w32_loop_left:
+ mova m0, [rsp+0]
+ vpblendd m0, [rsp+32+r7*4-12], 0x0C
+ dec r7d
+ jg .w32_loop
+ cmp hd, 2
+ je .w32_loop
+ pinsrw xm6, [rsp+32], 6
+ pinsrw xm6, [leftq+strideq*2], 5
+ pinsrw xm6, [leftq+stride3q ], 4
+ lea leftq, [leftq+strideq*4]
+ pinsrw xm6, [leftq+strideq*0], 3
+ pinsrw xm6, [leftq+strideq*1], 2
+ pinsrw xm6, [leftq+strideq*2], 1
+ pinsrw xm6, [leftq+stride3q ], 0
+ lea leftq, [leftq+strideq*4]
+ movu [rsp+36], xm6
+ pinsrw xm6, [leftq+strideq*0], 1
+ pinsrw xm6, [leftq+strideq*1], 0
+ movd [rsp+32], xm6
+ mov r7d, 4
+.w32_loop:
+ vpermq m13, m13, q3322
+ vpermq m11, m9, q2020
+ vpermq m9, m9, q1302
+ vpermq m6, m12, q0123
+ psrldq m7, 4
+ vpblendd m13, m10, 0xCC
+ vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+.w32_loop_start:
+ mova m13, m12
+ psrldq m7, m12, 8
+ punpcklwd m7, m12
+ vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321
+ vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
+ vpermq m12, m10, q2031
+ mova [rsp+0], m0
+ psrldq m8, m11, 8
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 10
+ psrldq xm0, xm13, 2
+ punpcklwd m8, m11
+ punpcklwd xm7, xm6
+ vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321
+ vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321
+ vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321
+ call .main
+ vpermq m8, m11, q3120
+ vpblendd m6, m8, m9, 0xCC
+ mova [dstq+strideq*0+16], xm6
+ vextracti128 [dstq+strideq*1+16], m6, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop_left
+ jz .w32_loop_last
+ vpermq m8, m9, q3120
+ vextracti128 xm0, m8, 1 ; 4321 ____
+ pshufd xm11, xm11, q1032
+ vpblendd xm0, xm11, 0x02 ; 4321 0___
+ psrldq xm6, xm8, 2
+ psrldq xm7, xm8, 12
+ pblendw xm0, xm6, 0x4 ; 4321 05__
+ pblendw xm0, xm7, 0x2 ; 4321 056_
+ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
+ vpermq m12, m13, q1302
+ vpblendd m12, m10, 0xCC
+ vpblendd m9, m6, 0xCC
+ mova [dstq+strideq*0+ 0], xm12
+ mova [dstq+strideq*0+16], xm9
+ vextracti128 [dstq+strideq*1+ 0], m12, 1
+ vextracti128 [dstq+strideq*1+16], m9, 1
+ RET
+.main:
+ FILTER_2BLK 9, 8, 6, 7, 0, 14, 15
+ ret
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
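+; Computes dc + sign(alpha*ac) * ((|alpha| * |ac| + 32) >> 6). The multiply is
+; done on absolute values so that pmulhrsw (with m2 = |alpha| << 9) performs
+; the rounding and the >>6 in a single step; m0 = dc, m1 = alpha.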
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ psignw m3, m%1, m1
+ pabsw m%1, m%1
+ pmulhrsw m%1, m2
+ psignw m%1, m3
+ paddw m%1, m0
+%endmacro
+
+cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ add tlq, 2
+ movd xm4, wd
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ pavgw xm4, xm6
+ tzcnt wd, wd
+ movd xm5, wd
+ movu m0, [tlq]
+ lea t0, [ipred_cfl_left_16bpc_avx2_table]
+ movsxd r6, [t0+wq*4]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ sub tlq, hq
+ movd xm4, hd
+ sub tlq, hq
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ pavgw xm4, xm6
+ tzcnt r6d, hd
+ movd xm5, r6d
+ movu m0, [tlq]
+ lea t0, [ipred_cfl_left_16bpc_avx2_table]
+ movsxd r6, [t0+r6*4]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
+ tzcnt wd, wd
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h32:
+ paddw m0, [tlq+32]
+.h16:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h8:
+ psrldq xm1, xm0, 8
+ paddw xm0, xm1
+.h4:
+ punpcklwd xm0, xm6
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ paddd xm0, xm4
+ psrld xm0, xm5
+ vpbroadcastw m0, xm0
+ jmp wq
+
+cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd xm4, t0d
+ tzcnt t0d, t0d
+ movd xm5, t0d
+ lea t0, [ipred_cfl_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ psrlw xm4, 1
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h4:
+ movq xm0, [tlq-8]
+ jmp wq
+.w4:
+ movq xm1, [tlq+2]
+ paddw m0, m4
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrld m1, m0, 16
+ paddw m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
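+; 0xAAAB and 0x6667 are ~2/3 and ~2/5 in 0.16 fixed point; pmulhuw followed by
+; the final shift divides the sum by 3 or 5 for the block sizes whose summed
+; pixel count is not a power of two.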
+.w4_mul:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ lea r2d, [hq*2]
+ mov r6d, 0xAAAB6667
+ shrx r6d, r6d, r2d
+ punpckhwd xm1, xm0, xm6
+ punpcklwd xm0, xm6
+ paddd xm0, xm1
+ movd xm1, r6d
+ psrld xm0, 2
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w4_end:
+ vpbroadcastw m0, xm0
+.s4:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq]
+ IPRED_CFL 4
+ pmaxsw m4, m6
+ pminsw m4, m7
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm5
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+r6 ], xm5
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ mova xm0, [tlq-16]
+ jmp wq
+.w8:
+ vextracti128 xm1, m0, 1
+ paddw xm0, [tlq+2]
+ paddw xm0, xm4
+ paddw xm0, xm1
+ psrld xm1, xm0, 16
+ paddw xm0, xm1
+ pblendw xm0, xm6, 0xAA
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w8_end:
+ vpbroadcastw m0, xm0
+.s8:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+strideq*0], xm4
+ mova [dstq+strideq*2], xm5
+ vextracti128 [dstq+strideq*1], m4, 1
+ vextracti128 [dstq+r6 ], m5, 1
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-32]
+ jmp wq
+.w16:
+ paddw m0, [tlq+2]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpckhwd xm1, xm0, xm6
+ punpcklwd xm0, xm6
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+strideq*0], m4
+ mova [dstq+strideq*1], m5
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-32]
+ jmp wq
+.w32:
+ paddw m0, [tlq+ 2]
+ paddw m0, [tlq+34]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm6
+ punpckhwd xm0, xm6
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w32_end:
+ vpbroadcastw m0, xm0
+.s32:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+32*0], m4
+ mova [dstq+32*1], m5
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ mov r6d, r7m
+ shr r6d, 11
+ lea t0, [ipred_cfl_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movifnidn hd, hm
+ movsxd wq, [t0+wq*4]
+ vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4]
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ add wq, t0
+ movifnidn acq, acmp
+ jmp wq
+
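+; wpad/hpad are given in units of 4 samples; padded areas are filled by
+; replicating the last valid samples, so they still contribute to the average
+; that gets subtracted at .dc.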
+cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_2]
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ jg .w16
+ je .w8
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ mova xm0, [ypxq+strideq*2]
+ mova xm1, [ypxq+r3 ]
+ vinserti128 m0, [ypxq+strideq*0], 1
+ vinserti128 m1, [ypxq+strideq*1], 1
+ lea ypxq, [ypxq+strideq*4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ paddd m4, m0
+ packssdw xm1, xm0
+ mova [acq], xm1
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .dc
+ vpermq m1, m1, q1111
+ pslld xm0, 2
+.w4_hpad_loop:
+ mova [acq], m1
+ paddd m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .dc
+.w8:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ paddd m4, m0
+ packssdw xm1, xm0, xm1
+ mova [acq], xm1
+ add acq, 16
+ dec hd
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz .dc
+ vinserti128 m1, xm1, 1
+ pslld m0, 2
+ jmp .hpad
+.w8_wpad1:
+ pmaddwd xm0, xm5, [ypxq+strideq*0]
+ pmaddwd xm3, xm5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd xm0, xm3
+ pshufd xm3, xm0, q3333
+ packssdw xm1, xm0, xm3
+ paddd xm0, xm3
+ paddd xm4, xm0
+ mova [acq], xm1
+ add acq, 16
+ dec hd
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad:
+ mova m0, [ypxq+strideq*0+ 0]
+ mova m1, [ypxq+strideq*1+ 0]
+ cmp wpadd, 2
+ jl .w16_wpad1
+ je .w16_wpad2
+ vpbroadcastd m2, [ypxq+strideq*0+12]
+ vpbroadcastd m3, [ypxq+strideq*1+12]
+ vpblendd m0, m2, 0xf0
+ vpblendd m1, m3, 0xf0
+ jmp .w16_wpad_end
+.w16_wpad2:
+ vpbroadcastd m2, [ypxq+strideq*0+28]
+ vpbroadcastd m3, [ypxq+strideq*1+28]
+ jmp .w16_wpad_end
+.w16_wpad1:
+ vpbroadcastd m2, [ypxq+strideq*0+44]
+ vpbroadcastd m3, [ypxq+strideq*1+44]
+ vinserti128 m2, [ypxq+strideq*0+32], 0
+ vinserti128 m3, [ypxq+strideq*1+32], 0
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ REPX {pmaddwd x, m5}, m0, m1, m2, m3
+ paddd m0, m1
+ paddd m2, m3
+ packssdw m1, m0, m2
+ paddd m0, m2
+ vpermq m1, m1, q3120
+ paddd m4, m0
+ mova [acq], m1
+ add acq, 32
+ dec hd
+ jg .w16_wpad
+ jmp .w16_hpad
+.w16:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+ 0]
+ pmaddwd m2, m5, [ypxq+strideq*0+32]
+ pmaddwd m1, m5, [ypxq+strideq*1+ 0]
+ pmaddwd m3, m5, [ypxq+strideq*1+32]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ paddd m2, m3
+ packssdw m1, m0, m2
+ paddd m0, m2
+ vpermq m1, m1, q3120
+ paddd m4, m0
+ mova [acq], m1
+ add acq, 32
+ dec hd
+ jg .w16_loop
+.w16_hpad:
+ add hpadd, hpadd
+ jz .dc
+ paddd m0, m0
+.hpad:
+ mova [acq+32*0], m1
+ paddd m4, m0
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hpadd, 4
+ jg .hpad
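+; Subtract the rounded average from every ac entry so the buffer is zero-mean.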
+.dc:
+ vextracti128 xm1, m4, 1
+ sub r5, acq ; -w*h*2
+ tzcnt r1d, r5d
+ paddd xm4, xm1
+ sub r1d, 2
+ punpckhqdq xm1, xm4, xm4
+ movd xm0, r1d
+ paddd xm1, xm4
+ pshuflw xm4, xm1, q1032
+ paddd xm1, xm4
+ psrld xm1, xm0
+ pxor xm0, xm0
+ pavgw xm1, xm0
+ vpbroadcastw m1, xm1
+.dc_loop:
+ mova m0, [acq+r5]
+ psubw m0, m1
+ mova [acq+r5], m0
+ add r5, 32
+ jl .dc_loop
+ RET
+
+cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_4]
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ jg .w16
+ je .w8
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ mova xm0, [ypxq+strideq*0]
+ mova xm1, [ypxq+strideq*1]
+ vinserti128 m0, [ypxq+strideq*2], 1
+ vinserti128 m1, [ypxq+r3 ], 1
+ lea ypxq, [ypxq+strideq*4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ paddd m4, m0
+ packssdw m0, m1
+ paddd m4, m1
+ mova [acq], m0
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vextracti128 xm1, m1, 1
+ vpermq m0, m0, q3333
+ pslld xm1, 2
+.w4_hpad_loop:
+ mova [acq], m0
+ paddd m4, m1
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w8:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m1, m5, [ypxq+strideq*0]
+ pmaddwd m0, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m1
+ packssdw m1, m0
+ paddd m4, m0
+ vpermq m2, m1, q3120
+ mova [acq], m2
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vpermq m1, m1, q3131
+ pslld m0, 2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w8_wpad1:
+ vpbroadcastd m1, [ypxq+strideq*0+12]
+ vpbroadcastd m0, [ypxq+strideq*1+12]
+ vinserti128 m1, [ypxq+strideq*0+ 0], 0
+ vinserti128 m0, [ypxq+strideq*1+ 0], 0
+ lea ypxq, [ypxq+strideq*2]
+ pmaddwd m1, m5
+ pmaddwd m0, m5
+ paddd m4, m1
+ packssdw m1, m0
+ paddd m4, m0
+ vpermq m2, m1, q3120
+ mova [acq], m2
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmaddwd m2, m5, [ypxq+strideq*0+ 0]
+ pmaddwd m1, m5, [ypxq+strideq*0+32]
+ pmaddwd m0, m5, [ypxq+strideq*1+ 0]
+ pmaddwd m3, m5, [ypxq+strideq*1+32]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m2
+ packssdw m2, m1
+ paddd m4, m1
+ packssdw m1, m0, m3
+ paddd m0, m3
+ vpermq m2, m2, q3120
+ paddd m4, m0
+ vpermq m1, m1, q3120
+ mova [acq+32*0], m2
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hd, 2
+ jg .w16_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
+.w16_wpad:
+ mova m2, [ypxq+strideq*0+ 0]
+ mova m0, [ypxq+strideq*1+ 0]
+ cmp wpadd, 2
+ jl .w16_wpad1
+ je .w16_wpad2
+ vpbroadcastd m1, [ypxq+strideq*0+12]
+ vpbroadcastd m3, [ypxq+strideq*1+12]
+ vpblendd m2, m1, 0xf0
+ vpblendd m0, m3, 0xf0
+ jmp .w16_wpad_end
+.w16_wpad2:
+ vpbroadcastd m1, [ypxq+strideq*0+28]
+ vpbroadcastd m3, [ypxq+strideq*1+28]
+ jmp .w16_wpad_end
+.w16_wpad1:
+ vpbroadcastd m1, [ypxq+strideq*0+44]
+ vpbroadcastd m3, [ypxq+strideq*1+44]
+ vinserti128 m1, [ypxq+strideq*0+32], 0
+ vinserti128 m3, [ypxq+strideq*1+32], 0
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ REPX {pmaddwd x, m5}, m2, m0, m1, m3
+ paddd m4, m2
+ packssdw m2, m1
+ paddd m4, m1
+ packssdw m1, m0, m3
+ paddd m0, m3
+ vpermq m2, m2, q3120
+ paddd m4, m0
+ vpermq m1, m1, q3120
+ mova [acq+32*0], m2
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hd, 2
+ jg .w16_wpad
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
+
+cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ lea r6, [ipred_cfl_ac_444_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_1]
+ movsxd wq, [r6+wq*4]
+ shl hpadd, 2
+ add wq, r6
+ mov hd, hm
+ pxor m4, m4
+ sub hd, hpadd
+ jmp wq
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ movq xm0, [ypxq+strideq*0]
+ movhps xm0, [ypxq+strideq*1]
+ vpbroadcastq m1, [ypxq+strideq*2]
+ vpbroadcastq m2, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ vpblendd m0, m1, 0x30
+ vpblendd m0, m2, 0xc0
+ psllw m0, 3
+ pmaddwd m1, m0, m5
+ mova [acq], m0
+ add acq, 32
+ paddd m4, m1
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vpermq m0, m0, q3333
+ paddd m1, m1
+ mova [acq+32*0], m0
+ vpermq m1, m1, q3333
+ mova [acq+32*1], m0
+ add acq, 32*2
+ paddd m4, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w8:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w8_loop:
+ mova xm2, [ypxq+strideq*0]
+ vinserti128 m2, [ypxq+strideq*1], 1
+ mova xm1, [ypxq+strideq*2]
+ vinserti128 m1, [ypxq+r3 ], 1
+ lea ypxq, [ypxq+strideq*4]
+ psllw m2, 3
+ psllw m1, 3
+ mova [acq+32*0], m2
+ pmaddwd m2, m5
+ mova [acq+32*1], m1
+ pmaddwd m0, m1, m5
+ add acq, 32*2
+ paddd m4, m2
+ paddd m4, m0
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vperm2i128 m1, m1, 0x11
+ pslld m0, 2
+ pxor m2, m2
+ vpblendd m0, m2, 0x0f
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w16_wpad2:
+ vpbroadcastw m3, [ypxq+strideq*0+14]
+ vpbroadcastw m0, [ypxq+strideq*1+14]
+ vpblendd m2, m3, 0xf0
+ vpblendd m1, m0, 0xf0
+ jmp .w16_wpad_end
+.w16:
+ mov r5, acq
+.w16_loop:
+ mova m2, [ypxq+strideq*0]
+ mova m1, [ypxq+strideq*1]
+ test wpadd, wpadd
+ jnz .w16_wpad2
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ psllw m2, 3
+ psllw m1, 3
+ mova [acq+32*0], m2
+ pmaddwd m2, m5
+ mova [acq+32*1], m1
+ pmaddwd m0, m1, m5
+ add acq, 32*2
+ paddd m4, m2
+ paddd m4, m0
+ sub hd, 2
+ jg .w16_loop
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ paddd m0, m0
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w32:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ mova m0, [ypxq+ 0]
+ mova m1, [ypxq+32]
+ add ypxq, strideq
+ psllw m0, 3
+ psllw m1, 3
+ pmaddwd m2, m0, m5
+ mova [acq+32*0], m0
+ pmaddwd m3, m1, m5
+ mova [acq+32*1], m1
+ add acq, 32*2
+ paddd m2, m3
+ paddd m4, m2
+ dec hd
+ jg .w32_loop
+.w32_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ paddd m2, m2
+.w32_hpad_loop:
+ mova [acq+32*0], m0
+ mova [acq+32*1], m1
+ paddd m4, m2
+ mova [acq+32*2], m0
+ mova [acq+32*3], m1
+ add acq, 32*4
+ sub hpadd, 2
+ jg .w32_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w32_wpad:
+ mova m0, [ypxq+ 0]
+ cmp wpadd, 4
+ jl .w32_wpad2
+ je .w32_wpad4
+ vpbroadcastw m1, [ypxq+14]
+ vpblendd m0, m1, 0xf0
+ jmp .w32_wpad_end
+.w32_wpad4:
+ vpbroadcastw m1, [ypxq+30]
+ jmp .w32_wpad_end
+.w32_wpad2:
+ vpbroadcastw m1, [ypxq+46]
+ vinserti128 m1, [ypxq+32], 0
+.w32_wpad_end:
+ add ypxq, strideq
+ psllw m0, 3
+ psllw m1, 3
+ pmaddwd m2, m0, m5
+ mova [acq+32*0], m0
+ pmaddwd m3, m1, m5
+ mova [acq+32*1], m1
+ add acq, 32*2
+ paddd m2, m3
+ paddd m4, m2
+ dec hd
+ jg .w32_wpad
+ jmp .w32_hpad
+
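+; The 16-bit palette is split into a low-byte table (m3) and a high-byte table
+; (m4) so that each 8-bit index can be resolved with two pshufb lookups, which
+; are then re-interleaved into full pixels.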
+cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h
+ vbroadcasti128 m3, [palq]
+ lea r2, [pal_pred_16bpc_avx2_table]
+ tzcnt wd, wm
+ vbroadcasti128 m4, [pal_pred_shuf]
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4]
+ pshufb m3, m4
+ punpckhqdq m4, m3, m3
+ add wq, r2
+DEFINE_ARGS dst, stride, stride3, idx, w, h
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova xm2, [idxq]
+ add idxq, 16
+ pshufb xm1, xm3, xm2
+ pshufb xm2, xm4, xm2
+ punpcklbw xm0, xm1, xm2
+ punpckhbw xm1, xm2
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movu m2, [idxq] ; only 16-byte alignment
+ add idxq, 32
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m0, 1
+ vextracti128 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ vpermq m2, [idxq+ 0], q3120
+ vpermq m5, [idxq+32], q3120
+ add idxq, 64
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufb m1, m3, m5
+ pshufb m2, m4, m5
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpermq m2, [idxq+ 0], q3120
+ vpermq m5, [idxq+32], q3120
+ add idxq, 64
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+32], m1
+ pshufb m1, m3, m5
+ pshufb m2, m4, m5
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+32], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ vpermq m2, [idxq+ 0], q3120
+ vpermq m5, [idxq+32], q3120
+ add idxq, 64
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+ 0], m0
+ mova [dstq+32], m1
+ pshufb m1, m3, m5
+ pshufb m2, m4, m5
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+64], m0
+ mova [dstq+96], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm
new file mode 100644
index 0000000000..1a307adc98
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx512.asm
@@ -0,0 +1,833 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3
+ db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11
+ db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7
+ db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15
+smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
+ db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
+pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
+ db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+ db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+ db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5
+ times 4 db 10, 11, 12, 13, 2, 3, -1, -1
+filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7
+ times 4 db 26, 27, 28, 29, 14, 15, -1, -1
+filter_permC: dd 8 ; dq 8, 10, 1, 11, 0, 9
+pw_1: times 2 dw 1
+ dd 10
+filter_rnd: dd 32
+ dd 1
+ dd 8
+ dd 11
+filter_shift: times 2 dw 6
+ dd 0
+ times 2 dw 4
+ dd 9
+
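+; The table label points 2 entries before the actual table: the smallest block
+; width is 4 (tzcnt = 2), so tzcnt(w) can be used directly as the index.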
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern filter_intra_taps
+
+SECTION .text
+
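+; Paeth prediction: base = left + top - topleft; pick whichever of left, top or
+; topleft is closest to base. m2 = left and m3 = topleft are set up by the
+; caller; the top row and its signed/absolute difference from topleft are
+; passed as macro arguments.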
+%macro PAETH 3 ; top, signed_ldiff, ldiff
+ paddw m0, m%2, m2
+ psubw m1, m0, m3 ; tldiff
+ psubw m0, m%1 ; tdiff
+ pabsw m1, m1
+ pabsw m0, m0
+ pcmpgtw k1, m0, m1
+ pminsw m0, m1
+ pcmpgtw k2, m%3, m0
+ vpblendmw m0{k1}, m%1, m3
+ vpblendmw m0{k2}, m2, m0
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
+%define base r6-ipred_paeth_16bpc_avx512icl_table
+ lea r6, [ipred_paeth_16bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastw m3, [tlq] ; topleft
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastq m4, [tlq+2] ; top
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w4_loop:
+ sub tlq, 16
+ vbroadcasti32x4 m2, [tlq]
+ pshufb m2, m7 ; left
+ PAETH 4, 5, 6
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm8, ym0, 1
+ vextracti32x4 xm9, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm8
+ movq [dstq+r6 ], xm9
+ sub hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm8
+ movhps [dstq+r6 ], xm9
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.w8:
+ vbroadcasti32x4 m4, [tlq+2]
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w8_loop:
+ sub tlq, 8
+ vpbroadcastq m2, [tlq]
+ pshufb m2, m7
+ PAETH 4, 5, 6
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m4, [tlq+2]
+ movsldup m7, [base+ipred_shuf]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w16_loop:
+ sub tlq, 4
+ vpbroadcastd m2, [tlq]
+ pshufb m2, m7
+ PAETH 4, 5, 6
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ movu m4, [tlq+2]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w32_loop:
+ sub tlq, 2
+ vpbroadcastw m2, [tlq]
+ PAETH 4, 5, 6
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+.w64:
+ movu m4, [tlq+ 2]
+ movu m7, [tlq+66]
+ psubw m5, m4, m3
+ psubw m8, m7, m3
+ pabsw m6, m5
+ pabsw m9, m8
+.w64_loop:
+ sub tlq, 2
+ vpbroadcastw m2, [tlq]
+ PAETH 4, 5, 6
+ mova [dstq+64*0], m0
+ PAETH 7, 8, 9
+ mova [dstq+64*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
+%define base r6-$$
+ lea r6, [$$]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4]
+ lea weightsq, [base+smooth_weights_1d_16bpc+hq*4]
+ neg hq
+ vpbroadcastw m6, [tlq+hq*2] ; bottom
+ lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq]
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastq m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w4_loop:
+ vbroadcasti32x4 m3, [weightsq+hq*2]
+ pshufb m3, m4
+ pmulhrsw m3, m5
+ paddw m3, m6
+ vextracti32x4 xm0, m3, 3
+ vextracti32x4 xm1, ym3, 1
+ vextracti32x4 xm2, m3, 2
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ add hq, 8
+ jg .end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jl .w4_loop
+.end:
+ RET
+.w8:
+ vbroadcasti32x4 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w8_loop:
+ vpbroadcastq m0, [weightsq+hq*2]
+ pshufb m0, m4
+ pmulhrsw m0, m5
+ paddw m0, m6
+ vextracti32x4 [dstq+strideq*0], m0, 3
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w16_loop:
+ vpbroadcastd m0, [weightsq+hq*2+0]
+ vpbroadcastd m1, [weightsq+hq*2+4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m6
+ paddw m1, m6
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], ym0
+ vextracti32x8 [dstq+strideq*2], m1, 1
+ mova [dstq+stride3q ], ym1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ movu m5, [tlq+2]
+ psubw m5, m6
+.w32_loop:
+ vpbroadcastw m0, [weightsq+hq*2+0]
+ vpbroadcastw m1, [weightsq+hq*2+2]
+ vpbroadcastw m2, [weightsq+hq*2+4]
+ vpbroadcastw m3, [weightsq+hq*2+6]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w32_loop
+ RET
+.w64:
+ movu m4, [tlq+ 2]
+ movu m5, [tlq+66]
+ psubw m4, m6
+ psubw m5, m6
+.w64_loop:
+ vpbroadcastw m1, [weightsq+hq*2+0]
+ vpbroadcastw m3, [weightsq+hq*2+2]
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m5
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m5
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ mova [dstq+strideq*1+64*0], m2
+ mova [dstq+strideq*1+64*1], m3
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
+ lea r6, [$$]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m6, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd
+ movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
+ sub tlq, hq
+ lea stride3q, [strideq*3]
+ lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
+ jmp wq
+.w4:
+ movsldup m4, [base+ipred_shuf]
+ vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2]
+.w4_loop:
+ vbroadcasti32x4 m0, [tlq+hq-16] ; left
+ pshufb m0, m4
+ psubw m0, m6 ; left - right
+ pmulhrsw m0, m5
+ paddw m0, m6
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ sub hd, 8*2
+ jl .end
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.end:
+ RET
+.w8:
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2]
+.w8_loop:
+ vpbroadcastq m0, [tlq+hq-8] ; left
+ pshufb m0, m4
+ psubw m0, m6 ; left - right
+ pmulhrsw m0, m5
+ paddw m0, m6
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2]
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ vpbroadcastd m1, [tlq+hq-8]
+ pshufb m0, m4
+ pshufb m1, m4
+ psubw m0, m6
+ psubw m1, m6
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m6
+ paddw m1, m6
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w16_loop
+ RET
+.w32:
+ movu m5, [base+smooth_weights_1d_16bpc+32*2]
+.w32_loop:
+ vpbroadcastq m3, [tlq+hq-8]
+ punpcklwd m3, m3
+ psubw m3, m6
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w32_loop
+ RET
+.w64:
+ movu m4, [base+smooth_weights_1d_16bpc+64*2]
+ movu m5, [base+smooth_weights_1d_16bpc+64*3]
+.w64_loop:
+ vpbroadcastw m1, [tlq+hq-2]
+ vpbroadcastw m3, [tlq+hq-4]
+ psubw m1, m6
+ psubw m3, m6
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m5
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m5
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ mova [dstq+strideq*1+64*0], m2
+ mova [dstq+strideq*1+64*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w64_loop
+ RET
+
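+; Smooth prediction: (v_weight*top + (256-v_weight)*bottom + h_weight*left +
+; (256-h_weight)*right + 256) >> 9, assuming the usual (w, 256-w) pairing in
+; the 2d weight tables; smooth_perm drops the low byte of each 32-bit sum and
+; pavgw adds the final rounding bit, which together give the +256 and >>9.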
+cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
+ lea r6, [$$]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m13, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd
+ movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
+ mov r5d, 0x55555555
+ sub tlq, hq
+ mova m14, [base+smooth_perm]
+ kmovd k1, r5d
+ vpbroadcastw m0, [tlq] ; bottom
+ mov r5, 0x3333333333333333
+ pxor m15, m15
+ lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
+ kmovq k2, r5
+ lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
+ jmp wq
+.w4:
+ vpbroadcastq m5, [tlq+hq+2]
+ movshdup m3, [base+ipred_shuf]
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4]
+ lea stride3q, [strideq*3]
+ punpcklwd m5, m0 ; top, bottom
+.w4_loop:
+ vbroadcasti32x4 m0, [v_weightsq]
+ vpbroadcastq m2, [tlq+hq-8]
+ mova m1, m13
+ pshufb m0, m3
+ pmaddwd m0, m5
+ pshufb m1{k2}, m2, m4 ; left, right
+ vpdpwssd m0, m1, m6
+ vpermb m0, m14, m0
+ pavgw ym0, ym15
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ add v_weightsq, 4*4
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym5, [tlq+hq+2]
+ movshdup m6, [base+ipred_shuf]
+ movsldup m7, [base+ipred_shuf]
+ pmovzxwd m5, ym5
+ vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4]
+ lea stride3q, [strideq*3]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w8_loop:
+ vpbroadcastq m0, [v_weightsq+0]
+ vpbroadcastq m1, [v_weightsq+8]
+ vpbroadcastd m3, [tlq+hq-4]
+ vpbroadcastd m4, [tlq+hq-8]
+ pshufb m0, m6
+ pmaddwd m0, m5
+ pshufb m1, m6
+ pmaddwd m1, m5
+ mova m2, m13
+ pshufb m2{k2}, m3, m7 ; left, right
+ mova m3, m13
+ pshufb m3{k2}, m4, m7
+ vpdpwssd m0, m2, m8
+ vpdpwssd m1, m3, m8
+ add v_weightsq, 4*4
+ vpermt2b m0, m14, m1
+ pavgw m0, m15
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ pmovzxwd m5, [tlq+hq+2]
+ mova m6, [base+smooth_weights_2d_16bpc+16*4]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w16_loop:
+ vpbroadcastd m0, [v_weightsq+0]
+ vpbroadcastd m1, [v_weightsq+4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ mova m2, m13
+ vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right
+ mova m3, m13
+ vpbroadcastw m3{k1}, [tlq+hq-4]
+ vpdpwssd m0, m2, m6
+ vpdpwssd m1, m3, m6
+ add v_weightsq, 2*4
+ vpermt2b m0, m14, m1
+ pavgw m0, m15
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxwd m5, [tlq+hq+ 2]
+ pmovzxwd m6, [tlq+hq+34]
+ mova m7, [base+smooth_weights_2d_16bpc+32*4]
+ mova m8, [base+smooth_weights_2d_16bpc+32*6]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+ vpblendmw m6{k1}, m0, m6
+.w32_loop:
+ vpbroadcastd m2, [v_weightsq+0]
+ vpbroadcastd m3, [v_weightsq+4]
+ pmaddwd m0, m5, m2
+ pmaddwd m2, m6
+ pmaddwd m1, m5, m3
+ pmaddwd m3, m6
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+ vpdpwssd m0, m4, m7
+ vpdpwssd m2, m4, m8
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-4]
+ vpdpwssd m1, m4, m7
+ vpdpwssd m3, m4, m8
+ add v_weightsq, 2*4
+ vpermt2b m0, m14, m2
+ vpermt2b m1, m14, m3
+ pavgw m0, m15
+ pavgw m1, m15
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxwd m5, [tlq+hq+ 2]
+ pmovzxwd m6, [tlq+hq+34]
+ pmovzxwd m7, [tlq+hq+66]
+ pmovzxwd m8, [tlq+hq+98]
+ mova m9, [base+smooth_weights_2d_16bpc+64*4]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+ mova m10, [base+smooth_weights_2d_16bpc+64*5]
+ vpblendmw m6{k1}, m0, m6
+ mova m11, [base+smooth_weights_2d_16bpc+64*6]
+ vpblendmw m7{k1}, m0, m7
+ mova m12, [base+smooth_weights_2d_16bpc+64*7]
+ vpblendmw m8{k1}, m0, m8
+.w64_loop:
+ vpbroadcastd m3, [v_weightsq]
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+ pmaddwd m0, m5, m3
+ pmaddwd m2, m6, m3
+ pmaddwd m1, m7, m3
+ pmaddwd m3, m8
+ vpdpwssd m0, m4, m9
+ vpdpwssd m2, m4, m10
+ vpdpwssd m1, m4, m11
+ vpdpwssd m3, m4, m12
+ add v_weightsq, 1*4
+ vpermt2b m0, m14, m2
+ vpermt2b m1, m14, m3
+ pavgw m0, m15
+ pavgw m1, m15
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ sub hd, 1*2
+ jg .w64_loop
+ RET
+
+cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
+ lea r6, [pal_pred_16bpc_avx512icl_table]
+ tzcnt wd, wm
+ mova m2, [pal_pred_perm]
+ movsxd wq, [r6+wq*4]
+ mova xm3, [palq]
+ movifnidn hd, hm
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ pmovzxbw ym0, [idxq]
+ add idxq, 16
+ vpermw ym0, ym0, ym3
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pmovzxbw m0, [idxq]
+ add idxq, 32
+ vpermw m0, m0, m3
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ vpermb m1, m2, [idxq]
+ add idxq, 64
+ vpermw m0, m1, m3
+ psrlw m1, 8
+ vpermw m1, m1, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpermb m1, m2, [idxq]
+ add idxq, 64
+ vpermw m0, m1, m3
+ psrlw m1, 8
+ vpermw m1, m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ vpermb m1, m2, [idxq]
+ add idxq, 64
+ vpermw m0, m1, m3
+ psrlw m1, 8
+ vpermw m1, m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row.
+;     w4    w8      w16         w32
+;     1     1 2     1 2 5 6     1 2 5 6 9 a d e
+;     2     2 3     2 3 6 7     2 3 6 7 a b e f
+;     3     3 4     3 4 7 8     3 4 7 8 b c f g
+;     4     4 5     4 5 8 9     4 5 8 9 c d g h
+
+cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top
+%define base r6-$$
+ lea r6, [$$]
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ movifnidn hd, hm
+ movu xm0, [tlq-6]
+ pmovsxbw m7, [base+filter_intra_taps+filterq+32*0]
+ pmovsxbw m8, [base+filter_intra_taps+filterq+32*1]
+ mov r5d, r8m ; bitdepth_max
+ movsldup m9, [base+filter_permA]
+ movshdup m10, [base+filter_permA]
+ shr r5d, 11 ; is_12bpc
+ jnz .12bpc
+ psllw m7, 2 ; upshift multipliers so that packusdw
+ psllw m8, 2 ; will perform clipping for free
+.12bpc:
+ vpbroadcastd m5, [base+filter_rnd+r5*8]
+ vpbroadcastd m6, [base+filter_shift+r5*8]
+ sub wd, 8
+ jl .w4
+.w8:
+ call .main4
+ movsldup m11, [filter_permB]
+ lea r5d, [hq*2+2]
+ movshdup m12, [filter_permB]
+ lea topq, [tlq+2]
+ mova m13, [filter_permC]
+ sub hd, 4
+ vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1
+ sub tlq, r5
+%if WIN64
+ push r7
+ push r8
+%endif
+ mov r7, dstq
+ mov r8d, hd
+.w8_loop:
+ movlps xm4, xm0, [tlq+hq*2]
+ call .main8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jge .w8_loop
+ test wd, wd
+ jz .end
+ mov r2d, 0x0d
+ kmovb k1, r2d
+ lea r2, [strideq*3]
+.w16:
+ movd xmm0, [r7+strideq*1+12]
+ vpblendd xmm0, [topq+8], 0x0e ; t1 t2
+ pinsrw xm4, xmm0, [r7+strideq*0+14], 2
+ call .main8
+ add r7, 16
+ vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3
+ mov hd, r8d
+ mov dstq, r7
+ add topq, 16
+.w16_loop:
+ movd xmm1, [dstq+strideq*2-4]
+ punpcklwd xm4, xmm1, xmm0
+ movd xmm0, [dstq+r2-4]
+ shufps xm4{k1}, xmm0, xm0, q3210
+ call .main8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jge .w16_loop
+ sub wd, 8
+ jg .w16
+.end:
+ vpermb m2, m11, m0
+ mova ym1, ym5
+ vpdpwssd m1, m2, m7
+ vpermb m2, m12, m0
+ vpdpwssd m1, m2, m8
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ vextracti32x8 ym2, m1, 1
+ paddd ym1, ym2
+ packusdw ym1, ym1
+ vpsrlvw ym1, ym6
+ vpermt2q m0, m13, m1
+ vextracti32x4 [dstq+strideq*0], m0, 2
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ RET
+.w4_loop:
+ movlps xm0, [tlq-10]
+ lea dstq, [dstq+strideq*2]
+ sub tlq, 4
+.w4:
+ call .main4
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.main4:
+ vpermb m2, m9, m0
+ mova ym1, ym5
+ vpdpwssd m1, m2, m7
+ vpermb m0, m10, m0
+ vpdpwssd m1, m0, m8
+ vextracti32x8 ym0, m1, 1
+ paddd ym0, ym1
+ vextracti32x4 xm1, ym0, 1
+ packusdw xm0, xm1 ; clip
+ vpsrlvw xm0, xm6
+ ret
+ALIGN function_align
+.main8:
+ vpermb m3, m11, m0
+ mova ym2, ym5
+ vpdpwssd m2, m3, m7
+ vpermb m3, m9, m4
+ mova ym1, ym5
+ vpdpwssd m1, m3, m7
+ vpermb m3, m12, m0
+ vpdpwssd m2, m3, m8
+ vpermb m3, m10, m4
+ vpdpwssd m1, m3, m8
+ vextracti32x8 ym4, m2, 1
+ vextracti32x8 ym3, m1, 1
+ paddd ym2, ym4
+ paddd ym1, ym3
+ packusdw ym1, ym2 ; clip
+ vpsrlvw ym1, ym6
+ vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1
+ vextracti32x4 [dstq+strideq*0], m0, 2
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ ret
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred16_sse.asm b/third_party/dav1d/src/x86/ipred16_sse.asm
new file mode 100644
index 0000000000..07ea9567e1
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_sse.asm
@@ -0,0 +1,1923 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
+pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+
+pb_0_1: times 4 db 0, 1
+pb_2_3: times 4 db 2, 3
+pw_1: times 4 dw 1
+pw_2: times 4 dw 2
+pw_4: times 4 dw 4
+pw_512: times 4 dw 512
+pw_2048: times 4 dw 2048
+
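+; Each table entry is the offset of a .%3 label relative to (%%table - 2*4);
+; callers index the table with tzcnt(size), movsxd the entry and add the same
+; biased base back to recover the label address (see ipred_dc_top below).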
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
+%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4)
+%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)
+
+JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
+ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
+JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
+JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_XMM ssse3
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
+ LEA r5, ipred_dc_left_16bpc_ssse3_table
+ movd m4, wm
+ tzcnt wd, wm
+ add tlq, 2
+ movifnidn hd, hm
+ pxor m3, m3
+ pavgw m4, m3
+ movd m5, wd
+ movu m0, [tlq]
+ movsxd r6, [r5+wq*4]
+ add r6, r5
+ add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_left_16bpc_ssse3_table
+ mov hd, hm
+ movd m4, hm
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ pxor m3, m3
+ sub tlq, hq
+ pavgw m4, m3
+ movd m5, r6d
+ movu m0, [tlq]
+ movsxd r6, [r5+r6*4]
+ add r6, r5
+ add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m2, [tlq+112]
+ movu m1, [tlq+ 96]
+ paddw m0, m2
+ movu m2, [tlq+ 80]
+ paddw m1, m2
+ movu m2, [tlq+ 64]
+ paddw m0, m2
+ paddw m0, m1
+.h32:
+ movu m1, [tlq+ 48]
+ movu m2, [tlq+ 32]
+ paddw m1, m2
+ paddw m0, m1
+.h16:
+ movu m1, [tlq+ 16]
+ paddw m0, m1
+.h8:
+ movhlps m1, m0
+ paddw m0, m1
+.h4:
+ punpcklwd m0, m3
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ lea stride3q, [strideq*3]
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
+cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd m4, r5d
+ tzcnt r5d, r5d
+ movd m5, r5d
+ LEA r5, ipred_dc_16bpc_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pxor m3, m3
+ psrlw m4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movq m0, [tlq-8]
+ jmp wq
+.w4:
+ movq m1, [tlq+2]
+ paddw m1, m0
+ punpckhwd m0, m3
+ punpcklwd m1, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3
+ jmp .w4_end
+.w4_mul:
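+; 0xAAAB ~= (1 << 17) / 3 and 0x6667 ~= (1 << 17) / 5; combined with the
+; surrounding shifts they divide the rounded sum by w + h (12 or 20 here)
+; without an integer division.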
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 16
+ cmove r2d, r3d
+ psrld m0, 2
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w4_end:
+ pshuflw m0, m0, q0000
+.s4:
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+.h8:
+ mova m0, [tlq-16]
+ jmp wq
+.w8:
+ movu m1, [tlq+2]
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 32
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+.h16:
+ mova m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w16:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ paddw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ test hd, 8|32
+ cmovz r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16c:
+ mova m1, m0
+.s16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*2+16*0], m0
+ mova [dstq+strideq*2+16*1], m1
+ mova [dstq+stride3q +16*0], m0
+ mova [dstq+stride3q +16*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-48]
+ paddw m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w32:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ movu m2, [tlq+34]
+ paddw m0, m2
+ movu m2, [tlq+50]
+ paddw m1, m2
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 32
+ je .w32_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 8
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32c:
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s32:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ mova [dstq+strideq*0+16*2], m2
+ mova [dstq+strideq*0+16*3], m3
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*3], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s32
+ RET
+.h64:
+ mova m0, [tlq-128]
+ mova m1, [tlq-112]
+ paddw m0, [tlq- 96]
+ paddw m1, [tlq- 80]
+ paddw m0, [tlq- 64]
+ paddw m1, [tlq- 48]
+ paddw m0, [tlq- 32]
+ paddw m1, [tlq- 16]
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+ 18]
+ paddw m1, m2
+ movu m2, [tlq+ 34]
+ paddw m0, m2
+ movu m2, [tlq+ 50]
+ paddw m1, m2
+ movu m2, [tlq+ 66]
+ paddw m0, m2
+ movu m2, [tlq+ 82]
+ paddw m1, m2
+ movu m2, [tlq+ 98]
+ paddw m0, m2
+ movu m2, [tlq+114]
+ paddw m1, m2
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 64
+ je .w64_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 16
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w64_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m0
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m0
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m0
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m0
+ add dstq, strideq
+ dec hd
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ mov r6d, r8m
+ LEA r5, ipred_dc_128_16bpc_ssse3_table
+ tzcnt wd, wm
+ shr r6d, 11
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_16bpc_ssse3_table
+ movifnidn hd, hm
+ movu m0, [tlq+ 2]
+ movu m1, [tlq+ 18]
+ movu m2, [tlq+ 34]
+ movu m3, [tlq+ 50]
+ cmp wd, 64
+ je .w64
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m4, [tlq+ 66]
+ movu m5, [tlq+ 82]
+ movu m6, [tlq+ 98]
+ movu m7, [tlq+114]
+.w64_loop:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ mova [dstq+16*4], m4
+ mova [dstq+16*5], m5
+ mova [dstq+16*6], m6
+ mova [dstq+16*7], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+%define base r5-ipred_h_16bpc_ssse3_table
+ tzcnt wd, wm
+ LEA r5, ipred_h_16bpc_ssse3_table
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m2, [base+pb_0_1]
+ movddup m3, [base+pb_2_3]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ sub tlq, 8
+ movq m3, [tlq]
+ pshuflw m0, m3, q3333
+ pshuflw m1, m3, q2222
+ pshuflw m2, m3, q1111
+ pshuflw m3, m3, q0000
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m1
+ movq [dstq+strideq*2], m2
+ movq [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ sub tlq, 8
+ movq m3, [tlq]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ sub tlq, 4
+ movd m1, [tlq]
+ pshufb m0, m1, m3
+ pshufb m1, m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m0
+ mova [dstq+strideq*1+16*0], m1
+ mova [dstq+strideq*1+16*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32:
+ sub tlq, 4
+ movd m1, [tlq]
+ pshufb m0, m1, m3
+ pshufb m1, m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m0
+ mova [dstq+strideq*0+16*2], m0
+ mova [dstq+strideq*0+16*3], m0
+ mova [dstq+strideq*1+16*0], m1
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*1+16*2], m1
+ mova [dstq+strideq*1+16*3], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ sub tlq, 2
+ movd m0, [tlq]
+ pshufb m0, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m0
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m0
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m0
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m0
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left
+%define base r5-ipred_paeth_16bpc_ssse3_table
+ movifnidn hd, hm
+ pshuflw m4, [tlq], q0000
+ mov leftq, tlq
+ add hd, hd
+ punpcklqdq m4, m4 ; topleft
+ sub leftq, hq
+ and wd, ~7
+ jnz .w8
+ movddup m5, [tlq+2] ; top
+ psubw m6, m5, m4
+ pabsw m7, m6
+.w4_loop:
+ movd m1, [leftq+hq-4]
+ punpcklwd m1, m1
+ punpckldq m1, m1 ; left
+%macro PAETH 0
+ paddw m0, m6, m1
+ psubw m2, m4, m0 ; tldiff
+ psubw m0, m5 ; tdiff
+ pabsw m2, m2
+ pabsw m0, m0
+ pminsw m2, m0
+ pcmpeqw m0, m2
+ pand m3, m5, m0
+ pandn m0, m4
+ por m0, m3
+ pcmpgtw m3, m7, m2
+ pand m0, m3
+ pandn m3, m1
+ por m0, m3
+%endmacro
+ PAETH
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2*2
+ jg .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ PUSH r6
+ %define r7d hm
+ %assign regs_used 7
+%elif WIN64
+ movaps r4m, m8
+ PUSH r7
+ %assign regs_used 8
+%endif
+%if ARCH_X86_64
+ movddup m8, [pb_0_1]
+%endif
+ lea tlq, [tlq+wq*2+2]
+ neg wq
+ mov r7d, hd
+.w8_loop0:
+ movu m5, [tlq+wq*2]
+ mov r6, dstq
+ add dstq, 16
+ psubw m6, m5, m4
+ pabsw m7, m6
+.w8_loop:
+ movd m1, [leftq+hq-2]
+%if ARCH_X86_64
+ pshufb m1, m8
+%else
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+%endif
+ PAETH
+ mova [r6], m0
+ add r6, strideq
+ sub hd, 1*2
+ jg .w8_loop
+ mov hd, r7d
+ add wq, 8
+ jl .w8_loop0
+%if WIN64
+ movaps m8, r4m
+%endif
+ RET
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 4
+%endif
+
+cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights
+ LEA weightsq, smooth_weights_1d_16bpc
+ mov hd, hm
+ lea weightsq, [weightsq+hq*4]
+ neg hq
+ movd m5, [tlq+hq*2] ; bottom
+ pshuflw m5, m5, q0000
+ punpcklqdq m5, m5
+ cmp wd, 4
+ jne .w8
+ movddup m4, [tlq+2] ; top
+ lea r3, [strideq*3]
+ psubw m4, m5 ; top - bottom
+.w4_loop:
+ movq m1, [weightsq+hq*2]
+ punpcklwd m1, m1
+ pshufd m0, m1, q1100
+ punpckhdq m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ PUSH r6
+ %assign regs_used 7
+ mov hm, hq
+ %define hq hm
+%elif WIN64
+ PUSH r7
+ %assign regs_used 8
+%endif
+.w8_loop0:
+ mov t0, hq
+ movu m4, [tlq+2]
+ add tlq, 16
+ mov r6, dstq
+ add dstq, 16
+ psubw m4, m5
+.w8_loop:
+ movq m3, [weightsq+t0*2]
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [r6+strideq*0], m0
+ mova [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+ mova [r6+strideq*0], m2
+ mova [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+ add t0, 4
+ jl .w8_loop
+ sub wd, 8
+ jg .w8_loop0
+ RET
+
+cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights
+ LEA weightsq, smooth_weights_1d_16bpc
+ mov wd, wm
+ movifnidn hd, hm
+ movd m5, [tlq+wq*2] ; right
+ sub tlq, 8
+ add hd, hd
+ pshuflw m5, m5, q0000
+ sub tlq, hq
+ punpcklqdq m5, m5
+ cmp wd, 4
+ jne .w8
+ movddup m4, [weightsq+4*2]
+ lea r3, [strideq*3]
+.w4_loop:
+ movq m1, [tlq+hq] ; left
+ punpcklwd m1, m1
+ psubw m1, m5 ; left - right
+ pshufd m0, m1, q3322
+ punpckldq m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movhps [dstq+strideq*2], m1
+ movq [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ lea weightsq, [weightsq+wq*4]
+ neg wq
+%if ARCH_X86_32
+ PUSH r6
+ %assign regs_used 7
+ %define hd hm
+%elif WIN64
+ PUSH r7
+ %assign regs_used 8
+%endif
+.w8_loop0:
+ mov t0d, hd
+ mova m4, [weightsq+wq*2]
+ mov r6, dstq
+ add dstq, 16
+.w8_loop:
+ movq m3, [tlq+t0*(1+ARCH_X86_32)]
+ punpcklwd m3, m3
+ psubw m3, m5
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [r6+strideq*0], m0
+ mova [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+ mova [r6+strideq*0], m2
+ mova [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+ sub t0d, 4*(1+ARCH_X86_64)
+ jg .w8_loop
+ add wq, 8
+ jl .w8_loop0
+ RET
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 10
+%else
+DECLARE_REG_TMP 3
+%endif
+
+cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \
+ h_weights, v_weights, top
+ LEA h_weightsq, smooth_weights_2d_16bpc
+ mov wd, wm
+ mov hd, hm
+ movd m7, [tlq+wq*2] ; right
+ lea v_weightsq, [h_weightsq+hq*8]
+ neg hq
+ movd m6, [tlq+hq*2] ; bottom
+ pshuflw m7, m7, q0000
+ pshuflw m6, m6, q0000
+ cmp wd, 4
+ jne .w8
+ movq m4, [tlq+2] ; top
+ mova m5, [h_weightsq+4*4]
+ punpcklwd m4, m6 ; top, bottom
+ pxor m6, m6
+.w4_loop:
+ movq m1, [v_weightsq+hq*4]
+ sub tlq, 4
+ movd m3, [tlq] ; left
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ pmaddwd m0, m4
+ punpcklwd m3, m7 ; left, right
+ pmaddwd m1, m4
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m6
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ lea h_weightsq, [h_weightsq+wq*4]
+ mov t0, tlq
+ mov r1m, tlq
+ mov r2m, hq
+ %define m8 [h_weightsq+16*0]
+ %define m9 [h_weightsq+16*1]
+%else
+%if WIN64
+ movaps r4m, m8
+ movaps r6m, m9
+ PUSH r7
+ PUSH r8
+%endif
+ PUSH r9
+ PUSH r10
+ %assign regs_used 11
+ lea h_weightsq, [h_weightsq+wq*8]
+ lea topq, [tlq+wq*2]
+ neg wq
+ mov r8, tlq
+ mov r9, hq
+%endif
+ punpcklqdq m6, m6
+.w8_loop0:
+%if ARCH_X86_32
+ movu m5, [t0+2]
+ add t0, 16
+ mov r0m, t0
+%else
+ movu m5, [topq+wq*2+2]
+ mova m8, [h_weightsq+wq*4+16*0]
+ mova m9, [h_weightsq+wq*4+16*1]
+%endif
+ mov t0, dstq
+ add dstq, 16
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+.w8_loop:
+ movd m1, [v_weightsq+hq*4]
+ sub tlq, 2
+ movd m3, [tlq] ; left
+ pshufd m1, m1, q0000
+ pmaddwd m0, m4, m1
+ pshuflw m3, m3, q0000
+ pmaddwd m1, m5
+ punpcklwd m3, m7 ; left, right
+ pmaddwd m2, m8, m3
+ pmaddwd m3, m9
+ paddd m0, m2
+ paddd m1, m3
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pxor m1, m1
+ pavgw m0, m1
+ mova [t0], m0
+ add t0, strideq
+ inc hq
+ jl .w8_loop
+%if ARCH_X86_32
+ mov t0, r0m
+ mov tlq, r1m
+ add h_weightsq, 16*2
+ mov hq, r2m
+ sub dword wm, 8
+ jg .w8_loop0
+%else
+ mov tlq, r8
+ mov hq, r9
+ add wq, 8
+ jl .w8_loop0
+%endif
+%if WIN64
+ movaps m8, r4m
+ movaps m9, r6m
+%endif
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
+%else
+cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
+%define m8 [esp+16*0]
+%define m9 [esp+16*1]
+%define m10 [esp+16*2]
+%define m11 [esp+16*3]
+%define m12 [esp+16*4]
+%define m13 [esp+16*5]
+%define m14 [esp+16*6]
+%define m15 [esp+16*7]
+%endif
+%define base r6-$$
+ movifnidn hd, hm
+ movd m6, r8m ; bitdepth_max
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ LEA r6, $$
+ shl filterd, 6
+ movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
+ mova m1, [base+filter_intra_taps+filterq+16*0]
+ mova m2, [base+filter_intra_taps+filterq+16*1]
+ mova m3, [base+filter_intra_taps+filterq+16*2]
+ mova m4, [base+filter_intra_taps+filterq+16*3]
+ pxor m5, m5
+%if ARCH_X86_64
+ punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper
+ punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid
+ punpcklbw m10, m5, m2 ; having to perform sign-extension.
+ punpckhbw m11, m5, m2
+ punpcklbw m12, m5, m3
+ punpckhbw m13, m5, m3
+ punpcklbw m14, m5, m4
+ punpckhbw m15, m5, m4
+%else
+ punpcklbw m7, m5, m1
+ mova m8, m7
+ punpckhbw m7, m5, m1
+ mova m9, m7
+ punpcklbw m7, m5, m2
+ mova m10, m7
+ punpckhbw m7, m5, m2
+ mova m11, m7
+ punpcklbw m7, m5, m3
+ mova m12, m7
+ punpckhbw m7, m5, m3
+ mova m13, m7
+ punpcklbw m7, m5, m4
+ mova m14, m7
+ punpckhbw m7, m5, m4
+ mova m15, m7
+%endif
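+; (The unpacks above scale each coefficient by 256, which is compensated for
+; later by shifting the accumulated sums right by 11 instead of by 3.)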
+ mova m7, [base+filter_shuf]
+ add hd, hd
+ mov r5, dstq
+ pshuflw m6, m6, q0000
+ mov r6, tlq
+ punpcklqdq m6, m6
+ sub tlq, hq
+.left_loop:
+ pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __
+ pshufd m1, m0, q0000
+ pmaddwd m2, m8, m1
+ pmaddwd m1, m9
+ pshufd m4, m0, q1111
+ pmaddwd m3, m10, m4
+ pmaddwd m4, m11
+ paddd m2, m3
+ paddd m1, m4
+ pshufd m4, m0, q2222
+ pmaddwd m3, m12, m4
+ pmaddwd m4, m13
+ paddd m2, m3
+ paddd m1, m4
+ pshufd m3, m0, q3333
+ pmaddwd m0, m14, m3
+ pmaddwd m3, m15
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 11 ; x >> 3
+ psrad m1, 11
+ packssdw m0, m1
+ pmaxsw m0, m5
+ pavgw m0, m5 ; (x + 8) >> 4
+ pminsw m0, m6
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movlps m0, [tlq+hq-10]
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2*2
+ jg .left_loop
+ sub wd, 4
+ jz .end
+ sub tld, r6d ; -h*2
+ sub r6, r5 ; tl-dst
+.right_loop0:
+ add r5, 8
+ mov hd, tld
+ movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
+ mov dstq, r5
+.right_loop:
+ pshufd m2, m0, q0000
+ pmaddwd m1, m8, m2
+ pmaddwd m2, m9
+ pshufd m4, m0, q1111
+ pmaddwd m3, m10, m4
+ pmaddwd m4, m11
+ pinsrw m0, [dstq+strideq*0-2], 5
+ paddd m1, m3
+ paddd m2, m4
+ pshufd m0, m0, q2222
+ movddup m4, [dstq+strideq*1-8]
+ pmaddwd m3, m12, m0
+ pmaddwd m0, m13
+ paddd m1, m3
+ paddd m0, m2
+ pshuflw m2, m4, q3333
+ punpcklwd m2, m5
+ pmaddwd m3, m14, m2
+ pmaddwd m2, m15
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 11
+ psrad m0, 11
+ packssdw m0, m1
+ pmaxsw m0, m5
+ pavgw m0, m5
+ pminsw m0, m6
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ palignr m0, m4, 14
+ lea dstq, [dstq+strideq*2]
+ add hd, 2*2
+ jl .right_loop
+ sub wd, 4
+ jg .right_loop0
+.end:
+ RET
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
+ LEA t0, ipred_cfl_left_16bpc_ssse3_table
+ movd m4, wd
+ tzcnt wd, wd
+ movifnidn hd, hm
+ add tlq, 2
+ movsxd r6, [t0+wq*4]
+ movd m5, wd
+ jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
+
+cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ LEA t0, ipred_cfl_left_16bpc_ssse3_table
+ tzcnt wd, wm
+ lea r6d, [hq*2]
+ movd m4, hd
+ sub tlq, r6
+ tzcnt r6d, hd
+ movd m5, r6d
+ movsxd r6, [t0+r6*4]
+.start:
+ movd m7, r7m
+ movu m0, [tlq]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
+ movsxd wq, [t0+wq*4]
+ pxor m6, m6
+ pshuflw m7, m7, q0000
+ pcmpeqw m3, m3
+ add wq, t0
+ movifnidn acq, acmp
+ pavgw m4, m6
+ punpcklqdq m7, m7
+ jmp r6
+.h32:
+ movu m1, [tlq+48]
+ movu m2, [tlq+32]
+ paddw m0, m1
+ paddw m0, m2
+.h16:
+ movu m1, [tlq+16]
+ paddw m0, m1
+.h8:
+ pshufd m1, m0, q1032
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshuflw m0, m4, q1032
+ paddd m0, m4
+ psrld m0, m5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
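+; dst = clip(dc + apply_sign((|alpha * ac| + 32) >> 6, alpha * ac), 0, max):
+; pmulhrsw against |alpha| << 9 (m2) performs the rounded division by 64, the
+; two psignw restore the sign of alpha * ac, and m0/m6/m7 hold the dc value,
+; zero and bitdepth_max respectively.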
+%macro IPRED_CFL 2 ; dst, src
+ pabsw m%1, m%2
+ pmulhrsw m%1, m2
+ psignw m%2, m1
+ psignw m%1, m%2
+ paddw m%1, m0
+ pmaxsw m%1, m6
+ pminsw m%1, m7
+%endmacro
+
+cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd m4, t0d
+ tzcnt t0d, t0d
+ movd m5, t0d
+ LEA t0, ipred_cfl_16bpc_ssse3_table
+ tzcnt wd, wd
+ movd m7, r7m
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ psrlw m4, 1
+ pxor m6, m6
+ pshuflw m7, m7, q0000
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ pcmpeqw m3, m3
+ punpcklqdq m7, m7
+ jmp r6
+.h4:
+ movq m0, [tlq-8]
+ jmp wq
+.w4:
+ movq m1, [tlq+2]
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ cmp hd, 4
+ jg .w4_mul
+ psrld m0, 3
+ jmp .w4_end
+.w4_mul:
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 16
+ cmove r6d, r2d
+ movd m1, r6d
+ psrld m0, 2
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w4_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s4:
+ movd m1, alpham
+ lea r6, [strideq*3]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ movq [dstq+strideq*0], m3
+ movhps [dstq+strideq*1], m3
+ movq [dstq+strideq*2], m4
+ movhps [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4_loop
+ RET
+.h8:
+ mova m0, [tlq-16]
+ jmp wq
+.w8:
+ movu m1, [tlq+2]
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmove r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+strideq*0], m3
+ mova [dstq+strideq*1], m4
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s8_loop
+ RET
+.h16:
+ mova m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w16:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*0], m3
+ mova [dstq+16*1], m4
+ add dstq, strideq
+ dec hd
+ jg .s16_loop
+ RET
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-48]
+ paddw m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w32:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ movu m2, [tlq+34]
+ paddw m1, m2
+ movu m2, [tlq+50]
+ paddw m1, m2
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 32
+ je .w32_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 8
+ cmove r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*0], m3
+ mova [dstq+16*1], m4
+ mova m4, [acq+16*2]
+ mova m5, [acq+16*3]
+ add acq, 16*4
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, strideq
+ dec hd
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
+ tzcnt wd, wm
+ LEA t0, ipred_cfl_splat_16bpc_ssse3_table
+ mov r6d, r7m
+ movifnidn hd, hm
+ shr r6d, 11
+ movd m7, r7m
+ movsxd wq, [t0+wq*4]
+ movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
+ pshuflw m7, m7, q0000
+ pxor m6, m6
+ add wq, t0
+ movifnidn acq, acmp
+ punpcklqdq m7, m7
+ jmp wq
+
+cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+%if ARCH_X86_32 && PIC
+ pcmpeqw m5, m5
+ pabsw m5, m5
+ paddw m5, m5
+%else
+ movddup m5, [pw_2]
+%endif
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ mov r5, acq
+ jg .w16
+ je .w8
+ lea r3, [strideq*3]
+.w4_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ pmaddwd m2, m5, [ypxq+strideq*2]
+ pmaddwd m3, m5, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ paddd m0, m1
+ paddd m2, m3
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .dc
+ punpckhqdq m0, m0
+ pslld m2, 2
+.w4_hpad:
+ mova [acq+16*0], m0
+ paddd m4, m2
+ mova [acq+16*1], m0
+ add acq, 16*2
+ sub hpadd, 4
+ jg .w4_hpad
+ jmp .dc
+.w8:
+%if ARCH_X86_32
+ cmp dword wpadm, 0
+%else
+ test wpadd, wpadd
+%endif
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m2, m5, [ypxq+strideq*1+16*0]
+ pmaddwd m1, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m3, m5, [ypxq+strideq*1+16*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m2
+ paddd m1, m3
+ paddd m2, m0, m1
+ packssdw m0, m1
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ dec hd
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz .dc
+ pslld m2, 2
+ mova m1, m0
+ jmp .hpad
+.w8_wpad1:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ pshufd m1, m0, q3333
+ paddd m2, m0, m1
+ packssdw m0, m1
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ dec hd
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad3:
+ pshufd m3, m0, q3333
+ mova m1, m3
+ mova m2, m3
+ jmp .w16_wpad_end
+.w16_wpad2:
+ pshufd m1, m3, q3333
+ mova m2, m1
+ jmp .w16_wpad_end
+.w16_wpad1:
+ pshufd m2, m1, q3333
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ WIN64_SPILL_XMM 7
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*0]
+ paddd m0, m6
+ cmp wpadd, 2
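+; wpad is 0-3 here: jg/je handle 3 and 2, and jp picks out wpad == 1 via the
+; parity flag of (wpad - 2), which is set for 0xff but not for 0xfe.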
+ jg .w16_wpad3
+ pmaddwd m3, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*1]
+ paddd m3, m6
+ je .w16_wpad2
+ pmaddwd m1, m5, [ypxq+strideq*0+16*2]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*2]
+ paddd m1, m6
+ jp .w16_wpad1
+ pmaddwd m2, m5, [ypxq+strideq*0+16*3]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*3]
+ paddd m2, m6
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ paddd m6, m0, m3
+ packssdw m0, m3
+ paddd m6, m1
+ mova [acq+16*0], m0
+ packssdw m1, m2
+ paddd m2, m6
+ mova [acq+16*1], m1
+ add acq, 16*2
+ paddd m4, m2
+ dec hd
+ jg .w16_loop
+ WIN64_RESTORE_XMM
+ add hpadd, hpadd
+ jz .dc
+ paddd m2, m2
+.hpad:
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ paddd m4, m2
+ mova [acq+16*2], m0
+ mova [acq+16*3], m1
+ add acq, 16*4
+ sub hpadd, 4
+ jg .hpad
+.dc:
+ sub r5, acq ; -w*h*2
+ pshufd m2, m4, q1032
+ tzcnt r1d, r5d
+ paddd m2, m4
+ sub r1d, 2
+ pshufd m4, m2, q2301
+ movd m0, r1d
+ paddd m2, m4
+ psrld m2, m0
+ pxor m0, m0
+ pavgw m2, m0
+ packssdw m2, m2
+.dc_loop:
+ mova m0, [acq+r5+16*0]
+ mova m1, [acq+r5+16*1]
+ psubw m0, m2
+ psubw m1, m2
+ mova [acq+r5+16*0], m0
+ mova [acq+r5+16*1], m1
+ add r5, 16*2
+ jl .dc_loop
+ RET
+
+cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+%if ARCH_X86_32 && PIC
+ pcmpeqw m5, m5
+ pabsw m5, m5
+ psllw m5, 2
+%else
+ movddup m5, [pw_4]
+%endif
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ mov r5, acq
+ jg .w16
+ je .w8
+ lea r3, [strideq*3]
+.w4_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m3, m5, [ypxq+strideq*1]
+ pmaddwd m1, m5, [ypxq+strideq*2]
+ pmaddwd m2, m5, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ paddd m4, m0
+ packssdw m0, m3
+ paddd m3, m1
+ packssdw m1, m2
+ paddd m4, m2
+ paddd m4, m3
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ punpckhqdq m1, m1
+ pslld m2, 3
+ mova [acq+16*0], m1
+ mova [acq+16*1], m1
+ paddd m4, m2
+ mova [acq+16*2], m1
+ mova [acq+16*3], m1
+ add acq, 16*4
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w8:
+%if ARCH_X86_32
+ cmp dword wpadm, 0
+%else
+ test wpadd, wpadd
+%endif
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m2, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m1, m5, [ypxq+strideq*1+16*0]
+ pmaddwd m3, m5, [ypxq+strideq*1+16*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ mova [acq+16*0], m0
+ paddd m2, m1, m3
+ packssdw m1, m3
+ paddd m4, m2
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ pslld m2, 2
+ mova m0, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w8_wpad1:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ pshufd m2, m0, q3333
+ pshufd m3, m1, q3333
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ paddd m2, m1, m3
+ packssdw m1, m3
+ paddd m4, m2
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 2
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad3:
+ pshufd m3, m0, q3333
+ mova m1, m3
+ mova m2, m3
+ jmp .w16_wpad_end
+.w16_wpad2:
+ pshufd m1, m3, q3333
+ mova m2, m1
+ jmp .w16_wpad_end
+.w16_wpad1:
+ pshufd m2, m1, q3333
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ WIN64_SPILL_XMM 7
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+16*0]
+ cmp wpadd, 2
+ jg .w16_wpad3
+ pmaddwd m3, m5, [ypxq+16*1]
+ je .w16_wpad2
+ pmaddwd m1, m5, [ypxq+16*2]
+ jp .w16_wpad1
+ pmaddwd m2, m5, [ypxq+16*3]
+.w16_wpad_end:
+ add ypxq, strideq
+ paddd m6, m0, m3
+ packssdw m0, m3
+ mova [acq+16*0], m0
+ paddd m6, m1
+ packssdw m1, m2
+ paddd m2, m6
+ mova [acq+16*1], m1
+ add acq, 16*2
+ paddd m4, m2
+ dec hd
+ jg .w16_loop
+ WIN64_RESTORE_XMM
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ paddd m2, m2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+
+cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
+ LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table
+ tzcnt wd, wm
+ movifnidn hpadd, hpadm
+ pxor m4, m4
+ movsxd wq, [r6+wq*4]
+ movddup m5, [base+pw_1]
+ add wq, r6
+ mov hd, hm
+ shl hpadd, 2
+ sub hd, hpadd
+ jmp wq
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ movq m0, [ypxq+strideq*0]
+ movhps m0, [ypxq+strideq*1]
+ movq m1, [ypxq+strideq*2]
+ movhps m1, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ psllw m0, 3
+ psllw m1, 3
+ mova [acq+16*0], m0
+ pmaddwd m0, m5
+ mova [acq+16*1], m1
+ pmaddwd m2, m5, m1
+ add acq, 16*2
+ paddd m4, m0
+ paddd m4, m2
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ punpckhqdq m1, m1
+ mova [acq+16*0], m1
+ pslld m2, 2
+ mova [acq+16*1], m1
+ punpckhqdq m2, m2
+ mova [acq+16*2], m1
+ paddd m4, m2
+ mova [acq+16*3], m1
+ add acq, 16*4
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w8:
+ mov r5, acq
+.w8_loop:
+ mova m0, [ypxq+strideq*0]
+ mova m1, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ psllw m0, 3
+ psllw m1, 3
+ mova [acq+16*0], m0
+ pmaddwd m0, m5
+ mova [acq+16*1], m1
+ pmaddwd m2, m5, m1
+ add acq, 16*2
+ paddd m4, m0
+ paddd m4, m2
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ pslld m2, 2
+ mova m0, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w16_wpad2:
+ pshufhw m3, m2, q3333
+ pshufhw m1, m0, q3333
+ punpckhqdq m3, m3
+ punpckhqdq m1, m1
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ mov r5, acq
+.w16_loop:
+ mova m2, [ypxq+strideq*0+16*0]
+ mova m0, [ypxq+strideq*1+16*0]
+ psllw m2, 3
+ psllw m0, 3
+ test wpadd, wpadd
+ jnz .w16_wpad2
+ mova m3, [ypxq+strideq*0+16*1]
+ mova m1, [ypxq+strideq*1+16*1]
+ psllw m3, 3
+ psllw m1, 3
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ mova [acq+16*0], m2
+ pmaddwd m2, m5
+ mova [acq+16*1], m3
+ pmaddwd m3, m5
+ paddd m4, m2
+ pmaddwd m2, m5, m0
+ mova [acq+16*2], m0
+ paddd m4, m3
+ pmaddwd m3, m5, m1
+ mova [acq+16*3], m1
+ add acq, 16*4
+ paddd m2, m3
+ paddd m4, m2
+ sub hd, 2
+ jg .w16_loop
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ paddd m2, m2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w32_wpad6:
+ pshufhw m1, m0, q3333
+ punpckhqdq m1, m1
+ mova m2, m1
+ mova m3, m1
+ jmp .w32_wpad_end
+.w32_wpad4:
+ pshufhw m2, m1, q3333
+ punpckhqdq m2, m2
+ mova m3, m2
+ jmp .w32_wpad_end
+.w32_wpad2:
+ pshufhw m3, m2, q3333
+ punpckhqdq m3, m3
+ jmp .w32_wpad_end
+.w32:
+ movifnidn wpadd, wpadm
+ mov r5, acq
+ WIN64_SPILL_XMM 8
+.w32_loop:
+ mova m0, [ypxq+16*0]
+ psllw m0, 3
+ cmp wpadd, 4
+ jg .w32_wpad6
+ mova m1, [ypxq+16*1]
+ psllw m1, 3
+ je .w32_wpad4
+ mova m2, [ypxq+16*2]
+ psllw m2, 3
+ jnp .w32_wpad2
+ mova m3, [ypxq+16*3]
+ psllw m3, 3
+.w32_wpad_end:
+ add ypxq, strideq
+ pmaddwd m6, m5, m0
+ mova [acq+16*0], m0
+ pmaddwd m7, m5, m1
+ mova [acq+16*1], m1
+ paddd m6, m7
+ pmaddwd m7, m5, m2
+ mova [acq+16*2], m2
+ paddd m6, m7
+ pmaddwd m7, m5, m3
+ mova [acq+16*3], m3
+ add acq, 16*4
+ paddd m6, m7
+ paddd m4, m6
+ dec hd
+ jg .w32_loop
+%if WIN64
+ mova m5, m6
+ WIN64_RESTORE_XMM
+ SWAP 5, 6
+%endif
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w32_hpad_loop:
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ paddd m4, m6
+ mova [acq+16*2], m2
+ mova [acq+16*3], m3
+ add acq, 16*4
+ dec hpadd
+ jg .w32_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+
+cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
+%define base r2-pal_pred_16bpc_ssse3_table
+%if ARCH_X86_32
+ %define hd r2d
+%endif
+ mova m3, [palq]
+ LEA r2, pal_pred_16bpc_ssse3_table
+ tzcnt wd, wm
+ pshufb m3, [base+pal_pred_shuf]
+ movsxd wq, [r2+wq*4]
+ pshufd m4, m3, q1032
+ add wq, r2
+ movifnidn hd, hm
+ jmp wq
+.w4:
+ mova m0, [idxq]
+ add idxq, 16
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ mova m0, [idxq]
+ add idxq, 16
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m0, [idxq]
+ add idxq, 16
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16
+ RET
+.w32:
+ mova m0, [idxq+16*0]
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova m2, [idxq+16*1]
+ add idxq, 16*2
+ mova [dstq+16*0], m0
+ pshufb m0, m3, m2
+ mova [dstq+16*1], m1
+ pshufb m1, m4, m2
+ punpcklbw m2, m0, m1
+ punpckhbw m0, m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m0
+ add dstq, strideq
+ dec hd
+ jg .w32
+ RET
+.w64:
+ mova m0, [idxq+16*0]
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova m2, [idxq+16*1]
+ mova [dstq+16*0], m0
+ pshufb m0, m3, m2
+ mova [dstq+16*1], m1
+ pshufb m1, m4, m2
+ punpcklbw m2, m0, m1
+ punpckhbw m0, m1
+ mova m1, [idxq+16*2]
+ mova [dstq+16*2], m2
+ pshufb m2, m3, m1
+ mova [dstq+16*3], m0
+ pshufb m0, m4, m1
+ punpcklbw m1, m2, m0
+ punpckhbw m2, m0
+ mova m0, [idxq+16*3]
+ add idxq, 16*4
+ mova [dstq+16*4], m1
+ pshufb m1, m3, m0
+ mova [dstq+16*5], m2
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm
new file mode 100644
index 0000000000..dd188a7f37
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_avx2.asm
@@ -0,0 +1,5387 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+; sm_weights[], but modified to precalculate x and 256-x with offsets to
+; enable efficient use of pmaddubsw (which requires signed values)
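+; Each weight w is stored as the signed byte pair (w - 128, 127 - w); the
+; SMOOTH macro below adds the remaining 128 * a + 129 * b term back in.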
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
+pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
+z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
+ db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
+ db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0
+z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
+ db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
+pb_128: times 4 db 128 ; those are just placed here for alignment.
+pb_36_m4: times 2 db 36, -4
+z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
+z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
+z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13
+z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
+z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
+z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+ dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
+z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64
+ dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64
+z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7
+ db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5
+; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
+filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1
+ db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1
+filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
+filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1
+pb_127_m127: times 2 db 127, -127
+ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
+ db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15
+ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1
+ db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0
+pw_64: times 2 dw 64
+
+cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
+ times 9 db 7, -1
+cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ ; w=8, w_pad=1 as well as second half of previous one
+cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
+ times 5 db 6, 7
+ ; w=16,w_pad=2
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ times 8 db 14, 15
+ ; w=16,w_pad=3
+ db 0, 1, 2, 3, 4, 5
+ times 13 db 6, 7
+pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+%define pb_0to15 cfl_ac_w16_pad_shuffle
+%define pb_1 (ipred_h_shuf+12)
+%define pb_2 (ipred_h_shuf+20)
+%define pb_3 (ipred_h_shuf+ 4)
+%define pb_4 (ipred_h_shuf+24)
+%define pb_5 (ipred_h_shuf+ 8)
+%define pb_7 (ipred_h_shuf+ 0)
+%define pb_8 (z_upsample2 +12)
+%define pb_12 (z2_y_shuf_h4+20)
+%define pb_14 (z2_y_shuf_h4+ 4)
+%define pb_15 (z_filter_s +32)
+%define pb_27 (z2_y_shuf_h4+ 8)
+%define pb_31 (z2_y_shuf_h4+12)
+%define pb_32 (z2_y_shuf_h4+16)
+%define pb_90 (z2_y_shuf_h4+ 0)
+%define pw_1 (z2_y_shuf_h4+24)
+%define pw_8 (z_filter_k +32)
+
+pw_62: times 2 dw 62
+pw_128: times 2 dw 128
+pw_255: times 2 dw 255
+pw_512: times 2 dw 512
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
+%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)
+
+JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32
+JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
+JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
+JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32
+JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
+ lea r5, [ipred_dc_left_avx2_table]
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ shrx r6d, r6d, wd
+ movd xm3, r6d
+ movsxd r6, [r5+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov r5d, 0x8000
+ shrx r5d, r5d, r6d
+ movd xm3, r5d
+ lea r5, [ipred_dc_left_avx2_table]
+ movsxd r6, [r5+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m1, [tlq+32] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h32:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+.h4:
+ pmaddwd xm0, xm2
+ pmulhrsw xm0, xm3
+ lea stride3q, [strideq*3]
+ vpbroadcastb m0, xm0
+ mova m1, m0
+ jmp wq
+
+cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm4, r5d
+ tzcnt r5d, r5d
+ movd xm5, r5d
+ lea r5, [ipred_dc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pcmpeqd m3, m3
+ psrlw xm4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd xm0, [tlq-4]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w4:
+ movd xm1, [tlq+1]
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334
+ paddw xm0, xm1
+ shrx r6d, r6d, r2d
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ movd xm1, r6d
+ psrlw xm0, 2
+ pmulhuw xm0, xm1
+.w4_end:
+ vpbroadcastb xm0, xm0
+.s4:
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm0
+ movd [dstq+strideq*2], xm0
+ movd [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ movq xm0, [tlq-8]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w8:
+ movq xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ punpckhqdq xm2, xm0, xm0
+ paddw xm0, xm2
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmove r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w8_end:
+ vpbroadcastb xm0, xm0
+.s8:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova xm0, [tlq-16]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w16:
+ movu xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w16_end:
+ vpbroadcastb xm0, xm0
+.s16:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w32_end:
+ vpbroadcastb m0, xm0
+.s32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-64]
+ mova m1, [tlq-32]
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 1]
+ movu m2, [tlq+33]
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x33345556
+ shrx r6d, r6d, hd
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w64_end:
+ vpbroadcastb m0, xm0
+ mova m1, m0
+.s64:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m0
+ mova [dstq+stride3q +32*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128]
+ mova m1, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_avx2_table]
+ tzcnt wd, wm
+ movu m0, [tlq+ 1]
+ movu m1, [tlq+33]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+%macro IPRED_H 2 ; w, store_type
+ vpbroadcastb m0, [tlq-1]
+ vpbroadcastb m1, [tlq-2]
+ vpbroadcastb m2, [tlq-3]
+ sub tlq, 4
+ vpbroadcastb m3, [tlq+0]
+ mov%2 [dstq+strideq*0], m0
+ mov%2 [dstq+strideq*1], m1
+ mov%2 [dstq+strideq*2], m2
+ mov%2 [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+ALIGN function_align
+%endmacro
+
+INIT_XMM avx2
+cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_h_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ IPRED_H 4, d
+.w8:
+ IPRED_H 8, q
+.w16:
+ IPRED_H 16, a
+INIT_YMM avx2
+.w32:
+ IPRED_H 32, a
+.w64:
+ vpbroadcastb m0, [tlq-1]
+ vpbroadcastb m1, [tlq-2]
+ vpbroadcastb m2, [tlq-3]
+ sub tlq, 4
+ vpbroadcastb m3, [tlq+0]
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m2
+ mova [dstq+strideq*2+32*1], m2
+ mova [dstq+stride3q +32*0], m3
+ mova [dstq+stride3q +32*1], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w64
+ RET
+
+%macro PAETH 2 ; top, ldiff
+ pavgb m1, m%1, m3 ; Calculating tldiff normally requires
+ pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it
+ pand m0, m4 ; in 8-bit with some tricks which avoids
+ psubusb m2, m5, m1 ; having to unpack everything to 16-bit.
+ psubb m1, m0
+ psubusb m1, m5
+ por m1, m2
+ paddusb m1, m1
+ por m1, m0 ; min(tldiff, 255)
+ psubusb m2, m5, m3
+ psubusb m0, m3, m5
+ por m2, m0 ; tdiff
+ pminub m2, m%2
+ pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
+ vpblendvb m0, m%1, m3, m0
+ pminub m1, m2
+ pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff
+ vpblendvb m0, m5, m0, m1
+%endmacro
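+; The selection above matches the usual Paeth rule; as an illustrative C
+; sketch (pl/pt/ptl corresponding to ldiff/tdiff/tldiff):
+;   int base = left + top - topleft;
+;   int pl = abs(base - left), pt = abs(base - top), ptl = abs(base - topleft);
+;   pred = (pl <= pt && pl <= ptl) ? left : (pt <= ptl) ? top : topleft;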
+
+cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
+%define base r5-ipred_paeth_avx2_table
+ lea r5, [ipred_paeth_avx2_table]
+ tzcnt wd, wm
+ vpbroadcastb m5, [tlq] ; topleft
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m4, [base+pb_1]
+ add wq, r5
+ jmp wq
+.w4:
+ vpbroadcastd m6, [tlq+1] ; top
+ mova m8, [base+ipred_h_shuf]
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:
+ sub tlq, 8
+ vpbroadcastq m3, [tlq]
+ pshufb m3, m8 ; left
+ PAETH 6, 7
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r3 ], xm1, 2
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 8
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ vpbroadcastq m6, [tlq+1]
+ mova m8, [base+ipred_h_shuf]
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w8_loop:
+ sub tlq, 4
+ vpbroadcastd m3, [tlq]
+ pshufb m3, m8
+ PAETH 6, 7
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m6, [tlq+1]
+ mova xm8, xm4 ; lower half = 1, upper half = 0
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w16_loop:
+ sub tlq, 2
+ vpbroadcastd m3, [tlq]
+ pshufb m3, m8
+ PAETH 6, 7
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w32_loop:
+ dec tlq
+ vpbroadcastb m3, [tlq]
+ PAETH 6, 7
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m6, [tlq+ 1]
+ movu m7, [tlq+33]
+%if WIN64
+ movaps r4m, xmm9
+%endif
+ psubusb m8, m5, m6
+ psubusb m0, m6, m5
+ psubusb m9, m5, m7
+ psubusb m1, m7, m5
+ por m8, m0
+ por m9, m1
+.w64_loop:
+ dec tlq
+ vpbroadcastb m3, [tlq]
+ PAETH 6, 8
+ mova [dstq+32*0], m0
+ PAETH 7, 9
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+%if WIN64
+ movaps xmm9, r4m
+%endif
+ RET
+
+%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
+ ; w * a = (w - 128) * a + 128 * a
+ ; (256 - w) * b = (127 - w) * b + 129 * b
+ pmaddubsw m0, m%3, m%1
+ pmaddubsw m1, m%4, m%2
+ paddw m0, m%5
+ paddw m1, m%6
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
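+; Combined, each output pixel is in effect (w*a + (256-w)*b + 128) >> 8: the
+; pmaddubsw covers the (w-128)*a + (127-w)*b part (both factors fit in a
+; signed byte, the weights table presumably storing (w-128, 127-w) pairs) and
+; the add[] operands hold the precomputed 128*a + 129*b + 128 remainder.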
+
+cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_avx2_table
+ lea r6, [ipred_smooth_v_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m0, [base+pb_127_m127]
+ vpbroadcastd m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ vpbroadcastb m5, [tlq+hq] ; bottom
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastd m2, [tlq+1]
+ punpcklbw m2, m5 ; top, bottom
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ punpckldq m4, m5, m5
+ punpckhdq m5, m5
+ pmaddubsw m3, m2, m0
+ paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; 128 * top + 129 * bottom + 128
+.w4_loop:
+ vbroadcasti128 m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 1
+ pextrd [dstq+r3 ], xm1, 1
+ cmp hd, -4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm1, 2
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ add hq, 8
+ jl .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ vpbroadcastq m2, [tlq+1]
+ punpcklbw m2, m5
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ pshufd m4, m5, q0000
+ pshufd m5, m5, q1111
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1
+.w8_loop:
+ vpbroadcastq m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ WIN64_SPILL_XMM 7
+ vbroadcasti128 m3, [tlq+1]
+ mova m6, [base+ipred_v_shuf]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w16_loop:
+ vpbroadcastd m1, [weightsq+hq*2]
+ pshufb m1, m6
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 6
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w32_loop:
+ vpbroadcastw m1, [weightsq+hq*2]
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m0
+ add dstq, strideq
+ inc hq
+ jl .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ WIN64_SPILL_XMM 11
+ movu m4, [tlq+ 1]
+ movu m8, [tlq+33]
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m7, m8, m5
+ punpckhbw m8, m5
+ pmaddubsw m5, m3, m0
+ pmaddubsw m6, m4, m0
+ pmaddubsw m9, m7, m0
+ pmaddubsw m10, m8, m0
+ paddw m2, m1, m3
+ paddw m5, m2
+ paddw m2, m1, m4
+ paddw m6, m2
+ paddw m0, m1, m7
+ paddw m9, m0
+ paddw m1, m8
+ paddw m10, m1
+.w64_loop:
+ vpbroadcastw m2, [weightsq+hq*2]
+ SMOOTH 2, 2, 3, 4, 5, 6
+ mova [dstq+32*0], m0
+ SMOOTH 2, 2, 7, 8, 9, 10
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
+
+%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
+ %assign stack_offset 0
+ %assign stack_size_padded 0
+ %assign regs_used %2
+ %xdefine rstk rsp
+ SETUP_STACK_POINTER %1
+ %if regs_used != %2 && WIN64
+ PUSH r%2
+ %endif
+ ALLOC_STACK %1, %3
+%endmacro
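+; Lets each width branch set up its own frame: x86inc's stack bookkeeping is
+; reset so ALLOC_STACK can be invoked again with a branch-specific stack size
+; and xmm count, since the functions using it are declared with no stack and
+; 0 xmm registers.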
+
+cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
+%define base r6-ipred_smooth_h_avx2_table
+ lea r6, [ipred_smooth_h_avx2_table]
+ mov wd, wm
+ vpbroadcastb m3, [tlq+wq] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m4, [base+pb_127_m127]
+ vpbroadcastd m5, [base+pw_128]
+ add wq, r6
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 8
+ vpbroadcastq m6, [base+smooth_weights+4*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 8
+ sub tlq, hq
+ lea r3, [strideq*3]
+.w4_loop:
+ vpbroadcastq m2, [tlq+hq]
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r3 ], xm1, 2
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 8
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+ vbroadcasti128 m6, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+.w8_loop:
+ vpbroadcastd m2, [tlq+hq]
+ pshufb m2, m7
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4
+ paddw m0, m1
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ SETUP_STACK_FRAME 32*4, 7, 8
+ lea r3, [rsp+64*2-4]
+ call .prep ; only worthwhile for w16 and above
+ sub tlq, 2
+ vpbroadcastd xm6, [base+pb_1]
+ mova xm7, [base+ipred_v_shuf+16]
+ vinserti128 m7, [base+ipred_v_shuf+ 0], 1
+ vbroadcasti128 m4, [base+smooth_weights+16*2]
+ vbroadcasti128 m5, [base+smooth_weights+16*3]
+.w16_loop:
+ vpbroadcastd m1, [tlq+hq]
+ vpbroadcastd m2, [r3+hq*2]
+ pshufb m1, m6
+ punpcklbw m1, m3
+ pshufb m2, m7
+ SMOOTH 4, 5, 1, 1, 2, 2
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ SETUP_STACK_FRAME 32*4, 7, 6
+ lea r3, [rsp+64*2-2]
+ call .prep
+ dec tlq
+ mova xm4, [base+smooth_weights+16*4]
+ vinserti128 m4, [base+smooth_weights+16*6], 1
+ mova xm5, [base+smooth_weights+16*5]
+ vinserti128 m5, [base+smooth_weights+16*7], 1
+.w32_loop:
+ vpbroadcastb m1, [tlq+hq]
+ punpcklbw m1, m3
+ vpbroadcastw m2, [r3+hq*2]
+ SMOOTH 4, 5, 1, 1, 2, 2
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ SETUP_STACK_FRAME 32*4, 7, 9
+ lea r3, [rsp+64*2-2]
+ call .prep
+ add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
+ dec tlq
+ mova xm5, [r6-16*7]
+ vinserti128 m5, [r6-16*5], 1
+ mova xm6, [r6-16*6]
+ vinserti128 m6, [r6-16*4], 1
+ mova xm7, [r6-16*3]
+ vinserti128 m7, [r6-16*1], 1
+ mova xm8, [r6-16*2]
+ vinserti128 m8, [r6-16*0], 1
+.w64_loop:
+ vpbroadcastb m2, [tlq+hq]
+ punpcklbw m2, m3
+ vpbroadcastw m4, [r3+hq*2]
+ SMOOTH 5, 6, 2, 2, 4, 4
+ mova [dstq+32*0], m0
+ SMOOTH 7, 8, 2, 2, 4, 4
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+ALIGN function_align
+.prep:
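+ ; Pre-filters the left edge: for each of the 64 left-edge pixels this stores
+ ; the 128*left + 129*right + 128 word term in a stack buffer, so the width
+ ; loops above only need to add the weight-dependent pmaddubsw term (see the
+ ; SMOOTH macro).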
+ vpermq m2, [tlq-32*1], q3120
+ punpckhbw m1, m2, m3
+ punpcklbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m1, m5 ; 1 * left + 256 * right + 128
+ paddw m0, m1 ; 128 * left + 129 * right + 128
+ pmaddubsw m1, m2, m4
+ paddw m2, m5
+ paddw m1, m2
+ vpermq m2, [tlq-32*2], q3120
+ mova [rsp+gprsize+32*3], m0
+ mova [rsp+gprsize+32*2], m1
+ punpckhbw m1, m2, m3
+ punpcklbw m2, m3
+ pmaddubsw m0, m1, m4
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m2, m5
+ paddw m1, m2
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*0], m1
+ sub r3, hq
+ sub tlq, hq
+ sub r3, hq
+ ret
+
+%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
+ pmaddubsw m0, m%3, m%1
+ pmaddubsw m1, m%4, m%2
+%ifnum %5
+ paddw m0, m%5
+%else
+ paddw m0, %5
+%endif
+%ifnum %6
+ paddw m1, m%6
+%else
+ paddw m1, %6
+%endif
+ pavgw m0, m2
+ pavgw m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
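+; The callers leave the horizontal smooth terms in m2/m3, so the pavgw pair
+; blends the vertical and horizontal 8.8 fixed-point sums with rounding
+; before the final >>8 and pack.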
+
+cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_avx2_table
+ lea r6, [ipred_smooth_avx2_table]
+ mov wd, wm
+ vpbroadcastb m4, [tlq+wq] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ mov r5, tlq
+ sub r5, hq
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m5, [base+pb_127_m127]
+ vpbroadcastb m0, [r5] ; bottom
+ vpbroadcastd m3, [base+pw_255]
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights+hq*2]
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 12
+ mova m10, [base+ipred_h_shuf]
+ vpbroadcastq m11, [base+smooth_weights+4*2]
+ mova m7, [base+ipred_v_shuf]
+ vpbroadcastd m8, [tlq+1]
+ sub tlq, 8
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m8, m0 ; top, bottom
+ pshufd m6, m7, q2200
+ pshufd m7, m7, q3311
+ pmaddubsw m9, m8, m5
+ paddw m3, m8 ; 1 * top + 256 * bottom + 255
+ paddw m9, m3 ; 128 * top + 129 * bottom + 255
+.w4_loop:
+ vpbroadcastq m1, [tlq+hq]
+ pshufb m1, m10
+ punpcklbw m0, m1, m4 ; left, right
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
+ pmaddubsw m3, m1, m5
+ paddw m2, m0 ; 128 * left + 129 * right
+ paddw m3, m1
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ paddw m2, m0
+ paddw m3, m1
+ vbroadcasti128 m1, [v_weightsq]
+ add v_weightsq, 16
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ SMOOTH_2D_END 0, 1, 8, 8, 9, 9
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r3 ], xm1, 2
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 8
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ mova m10, [base+ipred_h_shuf]
+ vbroadcasti128 m11, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_v_shuf]
+ vpbroadcastq m8, [tlq+1]
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m8, m0
+ pshufd m6, m7, q0000
+ pshufd m7, m7, q1111
+ pmaddubsw m9, m8, m5
+ paddw m3, m8
+ paddw m9, m3
+.w8_loop:
+ vpbroadcastd m1, [tlq+hq]
+ pshufb m1, m10
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5
+ pmaddubsw m3, m1, m5
+ paddw m2, m0
+ paddw m3, m1
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ paddw m2, m0
+ paddw m3, m1
+ vpbroadcastq m1, [v_weightsq]
+ add v_weightsq, 8
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ SMOOTH_2D_END 0, 1, 8, 8, 9, 9
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ SETUP_STACK_FRAME 32*4, 7, 14
+ vbroadcasti128 m11, [tlq+1]
+ lea r3, [rsp+64*2-4]
+ punpcklbw m10, m11, m0 ; top, bottom
+ punpckhbw m11, m0
+ call .prep_v
+ sub tlq, 2
+ pmaddubsw m12, m10, m5
+ pmaddubsw m13, m11, m5
+ vpbroadcastd xm5, [base+pb_1]
+ mova m9, [base+ipred_v_shuf]
+ vbroadcasti128 m6, [base+smooth_weights+16*2]
+ vbroadcasti128 m7, [base+smooth_weights+16*3]
+ vperm2i128 m8, m9, m9, 0x01
+ paddw m0, m10, m3
+ paddw m3, m11
+ paddw m12, m0
+ paddw m13, m3
+.w16_loop:
+ vpbroadcastd m3, [tlq+hq]
+ vpbroadcastd m0, [r3+hq*2]
+ vpbroadcastd m1, [v_weightsq]
+ add v_weightsq, 4
+ pshufb m3, m5
+ punpcklbw m3, m4 ; left, right
+ pmaddubsw m2, m3, m6
+ pmaddubsw m3, m7
+ pshufb m0, m8
+ pshufb m1, m9
+ paddw m2, m0
+ paddw m3, m0
+ SMOOTH_2D_END 1, 1, 10, 11, 12, 13
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ SETUP_STACK_FRAME 32*4, 7, 11
+ movu m8, [tlq+1]
+ lea r3, [rsp+64*2-2]
+ punpcklbw m7, m8, m0
+ punpckhbw m8, m0
+ call .prep_v
+ dec tlq
+ pmaddubsw m9, m7, m5
+ pmaddubsw m10, m8, m5
+ mova xm5, [base+smooth_weights+16*4]
+ vinserti128 m5, [base+smooth_weights+16*6], 1
+ mova xm6, [base+smooth_weights+16*5]
+ vinserti128 m6, [base+smooth_weights+16*7], 1
+ paddw m0, m7, m3
+ paddw m3, m8
+ paddw m9, m0
+ paddw m10, m3
+.w32_loop:
+ vpbroadcastb m3, [tlq+hq]
+ punpcklbw m3, m4
+ vpbroadcastw m0, [r3+hq*2]
+ vpbroadcastw m1, [v_weightsq]
+ add v_weightsq, 2
+ pmaddubsw m2, m3, m5
+ pmaddubsw m3, m6
+ paddw m2, m0
+ paddw m3, m0
+ SMOOTH_2D_END 1, 1, 7, 8, 9, 10
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ SETUP_STACK_FRAME 32*8, 7, 16
+ movu m13, [tlq+1 ]
+ movu m15, [tlq+33]
+ add r6, smooth_weights+16*15-ipred_smooth_avx2_table
+ lea r3, [rsp+64*2-2]
+ punpcklbw m12, m13, m0
+ punpckhbw m13, m0
+ punpcklbw m14, m15, m0
+ punpckhbw m15, m0
+ call .prep_v
+ dec tlq
+ pmaddubsw m0, m12, m5
+ pmaddubsw m1, m13, m5
+ pmaddubsw m2, m14, m5
+ pmaddubsw m5, m15, m5
+ mova xm8, [r6-16*7]
+ vinserti128 m8, [r6-16*5], 1
+ mova xm9, [r6-16*6]
+ vinserti128 m9, [r6-16*4], 1
+ mova xm10, [r6-16*3]
+ vinserti128 m10, [r6-16*1], 1
+ mova xm11, [r6-16*2]
+ vinserti128 m11, [r6-16*0], 1
+ lea r6, [rsp+32*4]
+ paddw m0, m3
+ paddw m1, m3
+ paddw m2, m3
+ paddw m3, m5
+ paddw m0, m12
+ paddw m1, m13
+ paddw m2, m14
+ paddw m3, m15
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+.w64_loop:
+ vpbroadcastb m5, [tlq+hq]
+ punpcklbw m5, m4
+ vpbroadcastw m6, [r3+hq*2]
+ vpbroadcastw m7, [v_weightsq]
+ add v_weightsq, 2
+ pmaddubsw m2, m5, m8
+ pmaddubsw m3, m5, m9
+ paddw m2, m6
+ paddw m3, m6
+ SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1]
+ mova [dstq+32*0], m0
+ pmaddubsw m2, m5, m10
+ pmaddubsw m3, m5, m11
+ paddw m2, m6
+ paddw m3, m6
+ SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3]
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+ALIGN function_align
+.prep_v:
+ vpermq m2, [tlq-32*1], q3120
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ pmaddubsw m0, m1, m5 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m2, m5
+ paddw m1, m2
+ vpermq m2, [tlq-32*2], q3120
+ mova [rsp+gprsize+32*3], m0
+ mova [rsp+gprsize+32*2], m1
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ pmaddubsw m0, m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m5
+ paddw m1, m2
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*0], m1
+ sub r3, hq
+ sub tlq, hq
+ sub r3, hq
+ ret
+
+cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z1_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea r7, [dr_intra_derivative]
+ inc tlq
+ movsxd wq, [r6+wq*4]
+ add wq, r6
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [r7+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ vpbroadcastd m3, [pw_512]
+ vpbroadcastd m4, [pw_62]
+ vpbroadcastd m5, [pw_64]
+ jmp wq
+.w4:
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ ALLOC_STACK -32, 8
+ mova xm1, [tlq-1]
+ pshufb xm0, xm1, [z_upsample1]
+ pshufb xm1, [z_upsample2]
+ vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
+ add dxd, dxd ; pw_512 (which is already in m3)
+ pmaddubsw xm0, xm2 ; for rounding instead of pw_2048
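+ ; (pmulhrsw with 512 computes (x+32)>>6, so with the filter
+ ; coefficients scaled by 4 this matches the usual (x+8)>>4 rounding)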
+ pextrd [rsp+16], xm1, 3 ; top[max_base_x]
+ pmaddubsw xm1, xm2
+ movd xm7, dxd
+ mov r3d, dxd ; xpos
+ vpbroadcastw m7, xm7
+ paddw xm1, xm0
+ movq xm0, [tlq]
+ pmulhrsw xm1, xm3
+ pslldq m6, m7, 8
+ paddw xm2, xm7, xm7
+ lea r2, [strideq*3]
+ paddw m6, m7
+ packuswb xm1, xm1
+ paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1
+ punpcklbw xm0, xm1
+ psllw m7, 2
+ mova [rsp], xm0
+.w4_upsample_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ vpbroadcastq m1, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vpbroadcastq m2, [rsp+r5]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ movq xm0, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ movhps xm0, [rsp+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2 ; 64-frac
+ psllw m2, 8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m3
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r2 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+ ; The C version uses a lot of branches, but we can do all the comparisons
+ ; in parallel and use popcnt to get the final filter strength value.
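+ ; Each set bit in the pmovmskb result corresponds, roughly, to one
+ ; angle/size threshold that was passed, so popcnt of the mask (done by the
+ ; callers) yields the strength value directly.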
+%define base r3-z_filter_t0
+ lea r3, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm2, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m2, xm2
+ pcmpeqb m1, m0, [base+z_filter_wh]
+ pand m1, m2
+ mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases
+ pcmpgtb m1, m2
+ pmovmskb r5d, m1
+ ret
+.w4_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -16, 11
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea maxbased, [hq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m7, [base+pb_8]
+ vbroadcasti128 m2, [tlq-1]
+ pminub m1, m7, [base+z_filter_s]
+ vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
+ pminub m7, [base+z_filter_s+8]
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
+ pshufb m0, m2, m1
+ shufps m1, m7, q2121
+ pmaddubsw m0, m8
+ pshufb m1, m2, m1
+ pmaddubsw m1, m9
+ pshufb m2, m7
+ pmaddubsw m2, m10
+ paddw m0, m1
+ paddw m0, m2
+ pmulhrsw m0, m3
+ mov r3d, 9
+ mov tlq, rsp
+ cmp hd, 4
+ cmovne maxbased, r3d
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [tlq], xm0
+.w4_main:
+ movd xm6, dxd
+ vpbroadcastq m0, [z_base_inc] ; base_inc << 6
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ mov r3d, dxd ; xpos
+ movd xm9, maxbased
+ vpbroadcastw m9, xm9
+ vbroadcasti128 m8, [z1_shuf_w4]
+ psrlw m7, 8 ; top[max_base_x]
+ paddw m10, m6, m6
+ psubw m9, m0 ; max_base_x
+ vpblendd m6, m10, 0xcc
+ mova xm0, xm10
+ paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1
+ paddw m10, m10
+.w4_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ vpbroadcastq m1, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vpbroadcastq m2, [tlq+r5]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ movq xm0, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ movhps xm0, [tlq+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m6 ; base < max_base_x
+ pmulhrsw m0, m3
+ paddw m6, m10 ; xpos += dx
+ lea r5, [dstq+strideq*2]
+ vpblendvb m0, m7, m0, m1
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [r5 +strideq*0], xm0
+ pextrd [r5 +strideq*1], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r3d, maxbased
+ jb .w4_loop
+ packuswb xm7, xm7
+ lea r6, [strideq*3]
+.w4_end_loop:
+ movd [dstq+strideq*0], xm7
+ movd [dstq+strideq*1], xm7
+ movd [dstq+strideq*2], xm7
+ movd [dstq+r6 ], xm7
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_end_loop
+.w4_end:
+ RET
+ALIGN function_align
+.w8:
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 8
+ movu xm2, [z_filter_s+6]
+ mova xm0, [tlq-1]
+ movd xm6, hd
+ vinserti128 m0, [tlq+7], 1
+ vpbroadcastb xm6, xm6
+ vbroadcasti128 m1, [z_upsample1]
+ pminub xm6, xm2
+ vpbroadcastd m7, [pb_36_m4]
+ vinserti128 m2, xm6, 1
+ add dxd, dxd
+ pshufb m1, m0, m1
+ pshufb m2, m0, m2
+ movd xm6, dxd
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ vpbroadcastw m6, xm6
+ mov r3d, dxd
+ psrldq m0, 1
+ lea r2, [strideq*3]
+ paddw m7, m6, m6
+ paddw m1, m2
+ vpblendd m6, m7, 0xf0
+ pmulhrsw m1, m3
+ pslldq m2, m7, 8
+ paddw m7, m7
+ paddw m6, m2
+ packuswb m1, m1
+ punpcklbw m0, m1
+ mova [rsp], m0
+.w8_upsample_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm0, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [rsp+r5], 1
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ punpcklqdq m1, m2, m2 ; frac0 frac1
+ pmaddubsw m0, m1
+ movu xm1, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ vinserti128 m1, [rsp+r5], 1
+ punpckhqdq m2, m2 ; frac2 frac3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ paddw m6, m7
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r2 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_upsample_loop
+ RET
+.w8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(h+7, 15)
+ jmp .w8_main
+.w8_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 10
+ lea maxbased, [hq+7]
+ test angled, 0x400
+ jnz .w8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w8_main ; filter_strength == 0
+ popcnt r5d, r5d
+ movu xm2, [tlq]
+ pminub xm1, xm0, [base+z_filter_s+14]
+ vinserti128 m2, [tlq-1], 1
+ vinserti128 m1, [base+z_filter_s+ 0], 1
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pminub xm0, [base+z_filter_s+22]
+ vinserti128 m0, [base+z_filter_s+ 8], 1
+ pshufb m6, m2, m1
+ pmaddubsw m6, m7
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1]
+ movzx r3d, byte [tlq+15]
+ shufps m1, m0, q2121
+ pshufb m1, m2, m1
+ pmaddubsw m1, m7
+ paddw m1, m6
+ sub r5d, 3
+ jnz .w8_3tap
+ ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
+ ; which also results in an awkward edge case where out[w*2] is
+ ; slightly different from out[max_base_x] when h > w.
+ vpbroadcastd m7, [z_filter_k+4*8]
+ movzx r2d, byte [tlq+14]
+ pshufb m2, m0
+ pmaddubsw m2, m7
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
+ mov [rsp+16], r2b
+ paddw m1, m2
+.w8_3tap:
+ pmulhrsw m1, m3
+ sar r5d, 1
+ mov tlq, rsp
+ add r5d, 17 ; w*2 + (filter_strength == 3)
+ cmp hd, 16
+ cmovns maxbased, r5d
+ mov [tlq+r5], r3b
+ vextracti128 xm0, m1, 1
+ packuswb xm0, xm1
+ mova [tlq], xm0
+.w8_main:
+ movd xm2, dxd
+ vbroadcasti128 m0, [z_base_inc]
+ vpbroadcastw m2, xm2
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ vpbroadcastw m9, xm9
+ psrlw m7, 8
+ psubw m9, m0
+ mov r3d, dxd
+ paddw m6, m2, m2
+ vpblendd m2, m6, 0xf0
+.w8_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6
+ pand m0, m4, m2
+ psubw m1, m5, m0
+ psllw m0, 8
+ por m1, m0
+ movu xm0, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [tlq+r5], 1
+ pshufb m0, m8
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m2
+ paddw m2, m6
+ pmulhrsw m0, m3
+ vpblendvb m0, m7, m0, m1
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w8_loop
+ packuswb xm7, xm7
+.w8_end_loop:
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(h+15, 31)
+ jmp .w16_main
+ALIGN function_align
+.w16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 12
+ lea maxbased, [hq+15]
+ test angled, 0x400
+ jnz .w16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w16_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m1, [base+pb_12]
+ vbroadcasti128 m6, [base+z_filter_s+8]
+ vinserti128 m2, m6, [base+z_filter_s], 0
+ vinserti128 m6, [base+z_filter_s+16], 1
+ mova xm10, [tlq-1]
+ vinserti128 m10, [tlq+3], 1
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0]
+ vbroadcasti128 m7, [base+z_filter_s+14]
+ vinserti128 m8, m7, [base+z_filter_s+6], 0
+ vinserti128 m7, [base+z_filter_s+22], 1
+ psubw m0, m1
+ movu xm11, [tlq+12]
+ vinserti128 m11, [tlq+16], 1
+ pminub m8, m0
+ pminub m7, m0
+ pshufb m0, m10, m2
+ shufps m2, m6, q2121
+ pmaddubsw m0, m9
+ pshufb m1, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m1, m9
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ movzx r3d, byte [tlq+31]
+ pshufb m2, m10, m2
+ pmaddubsw m2, m9
+ pshufb m8, m11, m8
+ pmaddubsw m8, m9
+ paddw m0, m2
+ paddw m1, m8
+ sub r5d, 3
+ jnz .w16_3tap
+ vpbroadcastd m9, [z_filter_k+4*8]
+ movzx r2d, byte [tlq+30]
+ pshufb m10, m6
+ pmaddubsw m10, m9
+ pshufb m11, m7
+ pmaddubsw m11, m9
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3
+ mov [rsp+32], r2b
+ paddw m0, m10
+ paddw m1, m11
+.w16_3tap:
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ sar r5d, 1
+ mov tlq, rsp
+ add r5d, 33
+ cmp hd, 32
+ cmovns maxbased, r5d
+ mov [tlq+r5], r3b
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [tlq], m0
+.w16_main:
+ movd xm6, dxd
+ vbroadcasti128 m0, [z_base_inc]
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ vpbroadcastw m9, xm9
+ mov r3d, dxd
+ psubw m9, m0
+ paddw m11, m6, m6
+ psubw m10, m9, m3 ; 64*8
+ vpblendd m6, m11, 0xf0
+.w16_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu xm0, [tlq+r3+0]
+ movu xm1, [tlq+r3+8]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [tlq+r5+0], 1
+ vinserti128 m1, [tlq+r5+8], 1
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w16_loop
+.w16_end_loop:
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ RET
+ALIGN function_align
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 15
+ lea r3d, [hq+31]
+ mov maxbased, 63
+ cmp hd, 32
+ cmovs maxbased, r3d
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ vbroadcasti128 m0, [pb_0to15]
+ sub r3d, 29 ; h+2
+ movu xm13, [tlq+29] ; 32-39
+ movd xm1, r3d
+ movu xm14, [tlq+37] ; 40-47
+ sub r3d, 8 ; h-6
+ vinserti128 m14, [tlq+51], 1 ; 56-63
+ vpbroadcastb xm1, xm1
+ mova xm11, [tlq- 1] ; 0- 7
+ vinserti128 m11, [tlq+13], 1 ; 16-23
+ movd xm2, r3d
+ movu xm12, [tlq+ 5] ; 8-15
+ vinserti128 m12, [tlq+19], 1 ; 24-31
+ pminub xm1, xm0 ; clip 32x8
+ mova m7, [z_filter_s+0]
+ pshufb xm13, xm1
+ vpbroadcastd m1, [pb_12]
+ vpbroadcastb xm2, xm2
+ vinserti128 m13, [tlq+43], 1 ; 48-55
+ vinserti128 m8, m7, [z_filter_s+4], 1
+ vpblendd m2, m1, 0xf0
+ vinserti128 m7, [z_filter_s+12], 0
+ pminub m2, m0 ; clip 32x16 and 32x(32|64)
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m14, m2
+ pshufb m0, m11, m8
+ shufps m8, m7, q1021
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m10, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m8
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m8
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m8
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m7
+ pmaddubsw m12, m9
+ movzx r3d, byte [tlq+63]
+ movzx r2d, byte [tlq+62]
+ paddw m0, m11
+ paddw m2, m12
+ pshufb m13, m7
+ pmaddubsw m13, m9
+ pshufb m14, m7
+ pmaddubsw m14, m9
+ paddw m1, m13
+ paddw m6, m14
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4] ; edge case for 32x64
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ shr r2d, 3
+ mov [rsp+64], r2b
+ mov tlq, rsp
+ mov [tlq+65], r3b
+ mov r3d, 65
+ cmp hd, 64
+ cmove maxbased, r3d
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq+ 0], m0
+ mova [tlq+32], m1
+.w32_main:
+ movd xm6, dxd
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ vpbroadcastw m9, xm9
+ mov r5d, dxd
+ psubw m9, [z_base_inc]
+ mova m11, m6
+ psubw m10, m9, m3 ; 64*8
+.w32_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu m0, [tlq+r3+0]
+ movu m1, [tlq+r3+8]
+ add r5d, dxd
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ mova [dstq], m0
+ dec hd
+ jz .w32_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w32_loop
+ test hb, 1
+ jz .w32_end_loop
+ mova [dstq], m7
+ add dstq, strideq
+ dec hd
+ jz .w32_end
+.w32_end_loop:
+ mova [dstq+strideq*0], m7
+ mova [dstq+strideq*1], m7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_end_loop
+.w32_end:
+ RET
+ALIGN function_align
+.w64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -128, 16
+ lea maxbased, [hq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ mova xm11, [tlq- 1] ; 0- 7
+ vinserti128 m11, [tlq+13], 1 ; 16-23
+ movu xm12, [tlq+ 5] ; 8-15
+ vinserti128 m12, [tlq+19], 1 ; 24-31
+ mova m7, [z_filter_s+0]
+ vinserti128 m8, m7, [z_filter_s+4], 1
+ vinserti128 m7, [z_filter_s+12], 0
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ movu xm13, [tlq+29] ; 32-39
+ vinserti128 m13, [tlq+43], 1 ; 48-55
+ movu xm14, [tlq+37] ; 40-47
+ vinserti128 m14, [tlq+51], 1 ; 56-63
+ pshufb m0, m11, m8
+ shufps m8, m7, q1021
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m10, m11, m8
+ shufps m15, m8, m7, q2121
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m15
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m10, [z_filter_k+4*2+12*2]
+ pshufb m11, m15
+ pmaddubsw m11, m10
+ pshufb m12, m7
+ pmaddubsw m12, m10
+ pshufb m13, m7
+ pmaddubsw m13, m10
+ pshufb m14, m7
+ pmaddubsw m14, m10
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ movu xm11, [tlq+ 61] ; 64- 71
+ vinserti128 m11, [tlq+ 75], 1 ; 80- 87
+ movu xm12, [tlq+ 69] ; 72- 79
+ vinserti128 m12, [tlq+ 83], 1 ; 88- 95
+ movu xm13, [tlq+ 93] ; 96-103
+ vinserti128 m13, [tlq+107], 1 ; 112-119
+ movu xm14, [tlq+101] ; 104-111
+ vinserti128 m14, [tlq+115], 1 ; 120-127
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ lea r3d, [hq-20]
+ mov tlq, rsp
+ packuswb m0, m2
+ packuswb m1, m6
+ vpbroadcastd xm2, [pb_14]
+ vbroadcasti128 m6, [pb_0to15]
+ mova [tlq+32*0], m0
+ mova [tlq+32*1], m1
+ movd xm0, r3d
+ vpbroadcastd m1, [pb_12]
+ vpbroadcastb m0, xm0
+ paddb m0, m2
+ pminub m0, m6 ; clip 64x16 and 64x32
+ pshufb m12, m0
+ pminub m1, m6 ; clip 64x64
+ pshufb m14, m1
+ pshufb m0, m11, m7
+ pmaddubsw m0, m10
+ pshufb m2, m12, m7
+ pmaddubsw m2, m10
+ pshufb m1, m13, m7
+ pmaddubsw m1, m10
+ pshufb m6, m14, m7
+ pmaddubsw m6, m10
+ pshufb m7, m11, m15
+ pmaddubsw m7, m9
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m0, m7
+ pshufb m7, m13, m15
+ pmaddubsw m7, m9
+ paddw m2, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m1, m7
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m8
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq+32*2], m0
+ mova [tlq+32*3], m1
+.w64_main:
+ movd xm12, dxd
+ vpbroadcastb m7, [tlq+maxbaseq]
+ lea r3d, [dxq-64]
+ shl maxbased, 6
+ vpbroadcastw m12, xm12
+ sub r3d, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ movd xm6, r3d
+ mov r5d, dxd
+ mova m10, [pb_1to32]
+ vpbroadcastd m11, [pb_32]
+ vpbroadcastw m6, xm6
+.w64_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ movu m0, [tlq+r3+ 0]
+ movu m1, [tlq+r3+ 8]
+ pand m2, m4, m6
+ psubw m9, m5, m2
+ psllw m2, 8
+ por m9, m2
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ psraw m2, m6, 6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packsswb m2, m2
+ paddb m2, m10
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [dstq+ 0], m0
+ movu m0, [tlq+r3+32]
+ movu m1, [tlq+r3+40]
+ add r5d, dxd
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ paddb m2, m11
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m6, m12
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [dstq+32], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w64_loop
+.w64_end_loop:
+ mova [dstq+ 0], m7
+ mova [dstq+32], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+
+cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
+%define base r9-z_filter_t0
+ lea r9, [ipred_z2_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea dxq, [dr_intra_derivative-90]
+ movsxd wq, [r9+wq*4]
+ movzx dyd, angleb
+ xor angled, 0x400
+ mov r8, dxq
+ sub dxq, dyq
+ add wq, r9
+ add r9, z_filter_t0-ipred_z2_avx2_table
+ mova m2, [tlq-64]
+ mova m0, [tlq-32]
+ mova m1, [tlq]
+ and dyd, ~1
+ and dxq, ~1
+ movzx dyd, word [r8+dyq] ; angle - 90
+ movzx dxd, word [dxq+270] ; 180 - angle
+ vpbroadcastd m13, [base+pw_512]
+ vpbroadcastd m14, [base+pw_62]
+ vpbroadcastd m15, [base+pw_64]
+ mova [rsp+ 0], m2
+ mova [rsp+32], m0
+ mova [rsp+64], m1
+ neg dxd
+ neg dyd
+ jmp wq
+.w4:
+ vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6
+ vbroadcasti128 m10, [base+z1_shuf_w4]
+ vbroadcasti128 m11, [base+z2_shuf_h4]
+ lea r2d, [dxq+(65<<6)] ; xpos
+ movd xm5, dyd
+ mov r8d, (63-4)<<6
+ mov dyq, -4
+ pshuflw xm5, xm5, q0000
+ pmullw xm5, [base+z2_ymul]
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm3, [base+pb_4]
+ call .upsample_above
+ sub angled, 1075 ; angle - 53
+ lea r3d, [hq+3]
+ xor angled, 0x7f ; 180 - angle
+ call .filter_strength
+ jmp .w4_filter_left
+ALIGN function_align
+.filter_strength:
+ movd xm8, r3d
+ mov r3d, angled
+ movd xm7, angled
+ vpbroadcastb m8, xm8
+ shr r3d, 8 ; is_sm << 1
+ vpbroadcastb m7, xm7
+ pcmpeqb m8, [base+z_filter_wh]
+ mova xm9, [r9+r3*8]
+ pand m0, m8, m7
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ ret
+ALIGN function_align
+.upsample_above: ; w4/w8
+ pshufb xm2, xm1, [base+z_upsample1-2]
+ pminub xm3, [base+z_filter_s+4]
+ vpbroadcastd xm4, [base+pb_36_m4]
+ vbroadcasti128 m10, [base+pb_0to15]
+ pshufb xm3, xm1, xm3
+ pmaddubsw xm2, xm4
+ pmaddubsw xm3, xm4
+ lea r2d, [r2+dxq+(1<<6)]
+ add dxd, dxd
+ paddw xm2, xm3
+ pmulhrsw xm2, xm13
+ sub r8d, 3<<6
+ paddw m6, m6
+ packuswb xm2, xm2
+ punpcklbw xm1, xm2
+ mova [rsp+gprsize+64], xm1
+ ret
+ALIGN function_align
+.upsample_left: ; h4/h8
+ mov r3d, hd
+ and r3d, 4
+ movd xm2, [rsp+gprsize+64]
+ movddup xm0, [rsp+gprsize+56]
+ movd xm1, r3d
+ palignr xm2, xm0, 1
+ vpbroadcastb xm1, xm1
+ pshufb xm2, [base+z_filter_s+18]
+ vpbroadcastd xm3, [base+pb_36_m4]
+ pmaxub xm1, [base+z_upsample1-2]
+ pshufb xm1, xm0, xm1
+ pmaddubsw xm2, xm3
+ pmaddubsw xm1, xm3
+ paddw xm5, xm5
+ add dyq, dyq
+ paddw xm1, xm2
+ pmulhrsw xm1, xm13
+ vbroadcasti128 m11, [base+z2_upsample]
+ paddw xm5, xm15
+ packuswb xm1, xm1
+ punpcklbw xm0, xm1
+ mova [rsp+gprsize+48], xm0
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ sub angled, 1112 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w4_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm2, [base+pb_4]
+ pminub xm2, [base+z_filter_s]
+ vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ pshufb xm3, xm1, xm2 ; 00 01 12 23
+ pshufd xm2, xm2, q0321
+ pmaddubsw xm0, xm3, xm0
+ pshufb xm2, xm1, xm2 ; 12 23 34 44
+ pmaddubsw xm2, xm4
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
+ punpckhqdq xm3, xm3 ; 34 44 44 44
+ pmaddubsw xm3, xm4
+ movd xm4, r6m ; max_width
+ pminsw xm4, xm15
+ vpbroadcastb xm4, xm4
+ paddw xm0, xm2
+ paddw xm0, xm3
+ pmulhrsw xm0, xm13
+ psubb xm4, [base+pb_1to32]
+ psrlq xm1, 8
+ packuswb xm0, xm0
+ vpblendvb xm0, xm1, xm4
+ movd [rsp+65], xm0
+.w4_no_filter_above:
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm0, [base+pb_90]
+ psubb xm0, xm7 ; 180 - angle
+ pand xm0, xm8 ; reuse from previous filter_strength call
+ pcmpgtb xm0, xm9
+ pmovmskb r3d, xm0
+.w4_filter_left:
+ test r3d, r3d
+ jz .w4_main
+ popcnt r3d, r3d
+ mov r5d, 10
+ cmp hd, 16
+ movu xm2, [rsp+49]
+ vinserti128 m2, [rsp+43], 1
+ cmovs r5d, hd
+ xor r5d, 15 ; h == 16 ? 5 : 15 - h
+ movd xm0, r5d
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab
+ vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
+ vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
+ vpbroadcastb m0, xm0
+ pmaxub m0, m3
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pshufb m0, m2, m0
+ pmaddubsw m0, m3
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1]
+ pshufb m1, m2, m1
+ pmaddubsw m1, m3
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2]
+ pshufb m2, m4
+ pmaddubsw m2, m3
+ movd xm4, r7m ; max_height
+ pminsw xm4, xm15
+ vpbroadcastb xm4, xm4
+ psubb xm4, [base+pb_16to1]
+ paddw m1, m0
+ paddw m1, m2
+ pmulhrsw m1, m13
+ vextracti128 xm0, m1, 1
+ packuswb xm0, xm1
+ vpblendvb xm0, [rsp+48], xm4
+ mova [rsp+48], xm0
+ jmp .w4_main
+.w4_upsample_left:
+ call .upsample_left
+.w4_main:
+ movd xm0, dxd
+ mova m12, [base+z2_y_shuf_h4]
+ lea r5, [rsp+56] ; left-7
+ vpbroadcastw m0, xm0
+ lea r9, [strideq*3]
+ psraw xm1, xm5, 6
+ pand xm5, xm14 ; frac_y
+ pxor xm2, xm2
+ paddw m7, m0, m0
+ psubw xm4, xm2, xm1 ; base_y
+ vpblendd m0, m7, 0xcc
+ mova xm1, xm7
+ punpcklwd xm4, xm2
+ paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1
+ psubw xm1, xm15, xm5 ; 64-frac_y
+ psllw xm5, 8
+ paddw m7, m7
+ paddw m6, m0
+ por xm5, xm1 ; 64-frac_y, frac_y
+ vpbroadcastq m5, xm5
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ vpbroadcastq m1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vpbroadcastq m2, [rsp+r3]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movq xm0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ movhps xm0, [rsp+r3]
+ vpblendd m1, m2, 0xc0
+ pand m2, m14, m6 ; frac_x
+ vpblendd m0, m1, 0xf0
+ psubw m1, m15, m2 ; 64-frac_x
+ psllw m2, 8
+ pshufb m0, m10
+ por m1, m2 ; 64-frac_x, frac_x
+ pmaddubsw m0, m1
+ cmp r3d, 64
+ jge .w4_toponly
+ mova m1, m7 ; arbitrary negative value
+ vpgatherdq m3, [r5+xm4], m1
+ pshufb m1, m3, m11
+ vpermd m1, m12, m1
+ pmaddubsw m1, m5
+ psraw m2, m6, 15 ; base_x < topleft
+ vpblendvb m0, m1, m2
+.w4_toponly:
+ pmulhrsw m0, m13
+ paddw m6, m7 ; xpos += dx
+ add r5, dyq
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r9 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r8d
+ jge .w4_loop
+.w4_leftonly_loop:
+ mova m1, m7
+ vpgatherdq m2, [r5+xm4], m1
+ add r5, dyq
+ pshufb m0, m2, m11
+ vpermd m0, m12, m0
+ pmaddubsw m0, m5
+ pmulhrsw m0, m13
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r9 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ RET
+.w8:
+ vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6
+ movd xm5, dyd
+ vbroadcasti128 m10, [base+z_filter_s+2]
+ vbroadcasti128 m11, [base+z2_shuf_h4]
+ lea r2d, [dxq+(65<<6)] ; xpos
+ vpbroadcastw xm5, xm5
+ mov r8d, (63-8)<<6
+ mov dyq, -4
+ pmullw xm5, [base+z2_ymul]
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+126]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ vpbroadcastd xm3, [base+pb_8]
+ movhps [rsp+80], xm1
+ call .upsample_above
+ sub angled, 53 ; angle - 53
+ lea r3d, [hq+7]
+ xor angled, 0x7f ; 180 - angle
+ call .filter_strength
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ sub angled, 90 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w8_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm3, [base+pb_8]
+ pminub xm3, [base+z_filter_s+8]
+ vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67
+ pmaddubsw xm0, xm2, xm0
+ pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88
+ shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88
+ pmaddubsw xm2, xm4
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
+ pmaddubsw xm3, xm4
+ movd xm4, r6m ; max_width
+ pminuw xm4, xm15
+ vpbroadcastb xm4, xm4
+ paddw xm0, xm2
+ paddw xm0, xm3
+ pmulhrsw xm0, xm13
+ psubb xm4, [base+pb_1to32]
+ psrldq xm1, 1
+ packuswb xm0, xm0
+ vpblendvb xm0, xm1, xm4
+ movq [rsp+65], xm0
+.w8_no_filter_above:
+ lea r3d, [angleq-51]
+ mov r3b, hb
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+.w8_filter_left:
+ test r3d, r3d
+ jz .w8_main
+ popcnt r3d, r3d
+ vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
+ cmp hd, 32
+ jne .w8_filter_left_h16
+ movu xm2, [rsp+27]
+ vinserti128 m2, [rsp+35], 1
+ vpbroadcastd xm0, [base+pb_5]
+ vbroadcasti128 m3, [base+z_filter_s+ 8]
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ pmaxub m3, m0
+ pshufb m3, m2, m3
+ pmaddubsw m3, m7
+ pshufb m1, m2, m1
+ pmaddubsw m1, m8
+ pshufb m2, m4
+ pmaddubsw m2, m9
+ paddw m3, m1
+ paddw m3, m2
+ pmulhrsw m3, m13
+ jmp .w8_filter_left_top16
+.w8_filter_left_h16:
+ mov r5d, 10
+ cmp hd, 16
+ cmovs r5d, hd
+ xor r5d, 15 ; h == 16 ? 5 : 15 - h
+ movd xm0, r5d
+ vpbroadcastb m0, xm0
+.w8_filter_left_top16:
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
+ vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
+ pmaxub m0, m2
+ movu xm2, [rsp+49]
+ vinserti128 m2, [rsp+43], 1
+ pshufb m0, m2, m0
+ pmaddubsw m0, m7
+ movd xm7, r7m ; max_height
+ pshufb m1, m2, m1
+ pmaddubsw m1, m8
+ pshufb m2, m4
+ pmaddubsw m2, m9
+ pminsw xm7, xm15
+ paddw m1, m0
+ vpbroadcastb m7, xm7
+ paddw m1, m2
+ pmulhrsw m1, m13
+ psubb m7, [base+pb_32to1]
+ packuswb m3, m1
+ vpermq m3, m3, q1320
+ vpblendvb m3, [rsp+32], m7
+ mova [rsp+32], m3
+ jmp .w8_main
+.w8_upsample_left:
+ call .upsample_left
+.w8_main:
+ movd xm3, dxd
+ lea r5, [rsp+56] ; left-7
+ pshufd xm1, xm5, q3120
+ pand xm5, xm14
+ vpbroadcastw m3, xm3
+ pxor xm0, xm0
+ psubw xm2, xm15, xm5
+ psraw xm1, 6
+ lea r9, [strideq*3]
+ paddw m7, m3, m3
+ psubw xm9, xm0, xm1 ; base_y
+ psllw xm5, 8
+ punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5
+ vpblendd m3, m7, 0xf0 ; xpos0 xpos1
+ por xm5, xm2 ; 64-frac_y, frac_y
+ punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7
+ paddw m6, m3
+ vinserti128 m12, m5, xm5, 1
+.w8_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vinserti128 m0, [rsp+r3], 1
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movu xm1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ vinserti128 m1, [rsp+r3], 1
+ pand m2, m14, m6
+ paddsw m4, m6, m7
+ psubw m5, m15, m2
+ psllw m2, 8
+ pshufb m0, m10
+ por m2, m5
+ pmaddubsw m0, m2
+ pand m2, m14, m4
+ psubw m5, m15, m2
+ psllw m2, 8
+ pshufb m1, m10
+ por m2, m5
+ pmaddubsw m1, m2
+ cmp r3d, 64
+ jge .w8_toponly
+ mova m5, m7
+ vpgatherdq m3, [r5+xm9], m7
+ mova m7, m5
+ vpgatherdq m2, [r5+xm8], m5
+ pshufb m3, m11
+ pshufb m2, m11
+ punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3
+ vpermq m5, m5, q3120 ; y0 y1
+ vpermq m2, m2, q3120 ; y2 y3
+ pmaddubsw m5, m12
+ pmaddubsw m2, m12
+ psraw m6, 15 ; base_x < topleft
+ vpblendvb m0, m5, m6
+ psraw m3, m4, 15
+ vpblendvb m1, m2, m3
+.w8_toponly:
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ paddw m6, m4, m7 ; xpos += dx
+ add r5, dyq
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r9 ], xm1
+ sub hd, 4
+ jz .w8_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r8d
+ jge .w8_loop
+.w8_leftonly_loop:
+ mova m0, m7
+ vpgatherdq m5, [r5+xm9], m7
+ mova m7, m0
+ vpgatherdq m3, [r5+xm8], m0
+ add r5, dyq
+ pshufb m2, m5, m11
+ pshufb m1, m3, m11
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ pmaddubsw m0, m12
+ pmaddubsw m1, m12
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_leftonly_loop
+.w8_end:
+ RET
+.w16:
+ mov r8d, hd
+ test angled, 0x400
+ jnz .w16_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w16_no_filter_above
+ popcnt r3d, r3d
+ vbroadcasti128 m6, [tlq+1]
+ mova xm2, [base+z_filter_s]
+ vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de
+ movu xm3, [base+z_filter_s+8]
+ vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff
+ vpblendd m1, m6, 0xf0
+ vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2]
+ pshufb m2, m1, m2
+ pshufb m1, m3
+ pmaddubsw m0, m2, m0
+ shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff
+ pmaddubsw m2, m4
+ pmaddubsw m1, m5
+ movd xm4, r6m ; max_width
+ pminsw xm4, xm15
+ vpbroadcastb xm4, xm4
+ paddw m0, m2
+ paddw m0, m1
+ pmulhrsw m0, m13
+ psubb xm4, [base+pb_1to32]
+ vextracti128 xm2, m0, 1
+ packuswb xm0, xm2
+ vpblendvb xm0, xm6, xm4
+ movu [rsp+65], xm0
+.w16_no_filter_above:
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ test r3d, r3d
+ jz .w16_main
+ popcnt r3d, r3d
+ vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
+.w16_filter_left:
+ movd xm6, r7m ; max_height
+ pminsw xm6, xm15
+ vpbroadcastb m6, xm6
+ cmp hd, 32
+ jl .w16_filter_left_h16
+ vpbroadcastd xm0, [base+pb_5]
+ vbroadcasti128 m10, [base+z_filter_s+ 8]
+ vbroadcasti128 m11, [base+z_filter_s+12]
+ vbroadcasti128 m12, [base+z_filter_s+16]
+ je .w16_filter_left_h32
+ movu m3, [tlq-69]
+ movu m5, [tlq-61]
+ pmaxub m1, m10, m0
+ pshufb m1, m3, m1
+ pmaddubsw m1, m7
+ pshufb m2, m3, m11
+ pmaddubsw m2, m8
+ pshufb m3, m12
+ pmaddubsw m3, m9
+ paddw m1, m2
+ pshufb m2, m5, m10
+ pmaddubsw m2, m7
+ pshufb m4, m5, m11
+ pmaddubsw m4, m8
+ pshufb m5, m12
+ pmaddubsw m5, m9
+ paddw m1, m3
+ vpbroadcastd m3, [base+pb_32]
+ paddb m3, [base+pb_32to1]
+ paddw m2, m4
+ paddw m2, m5
+ pmulhrsw m1, m13
+ pmulhrsw m2, m13
+ psubb m3, m6, m3
+ packuswb m1, m2
+ vpblendvb m1, [tlq-64], m3
+ mova [rsp], m1
+ jmp .w16_filter_left_top32
+.w16_filter_left_h32:
+ pmaxub m10, m0
+.w16_filter_left_top32:
+ movu xm2, [tlq-37]
+ vinserti128 m2, [tlq-29], 1
+ pshufb m3, m2, m10
+ pshufb m1, m2, m11
+ pshufb m2, m12
+ pmaddubsw m3, m7
+ pmaddubsw m1, m8
+ pmaddubsw m2, m9
+ paddw m3, m1
+ paddw m3, m2
+ pmulhrsw m3, m13
+ jmp .w16_filter_left_top16
+.w16_filter_left_h16:
+ mov r5d, 10
+ cmp hd, 16
+ cmovs r5d, hd
+ xor r5d, 15 ; h == 16 ? 5 : 15 - h
+ movd xm0, r5d
+ vpbroadcastb m0, xm0
+.w16_filter_left_top16:
+ movu xm2, [tlq-15]
+ vinserti128 m2, [tlq-21], 1
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab
+ vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
+ vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
+ pmaxub m0, m5
+ pshufb m0, m2, m0
+ pmaddubsw m0, m7
+ pshufb m1, m2, m1
+ pmaddubsw m1, m8
+ pshufb m2, m4
+ pmaddubsw m2, m9
+ psubb m6, [base+pb_32to1]
+ paddw m1, m0
+ paddw m1, m2
+ pmulhrsw m1, m13
+ packuswb m3, m1
+ vpermq m3, m3, q1320
+ vpblendvb m3, [tlq-32], m6
+ mova [rsp+32], m3
+.w16_main:
+ movd xm1, dyd
+ vbroadcasti128 m10, [base+z_filter_s+2]
+ movd xm7, dxd
+ vbroadcasti128 m11, [base+z2_shuf_h2]
+ vpbroadcastw m1, xm1
+ vpbroadcastw m7, xm7
+ mov r7, dstq
+ pmullw m0, m1, [base+z2_ymul]
+ psllw xm1, 4
+ paddw m6, m7, [base+z2_base_inc]
+ lea r9d, [dxq+(65<<6)] ; xpos
+ movd [rsp+156], xm1
+.w16_loop0:
+ mov r2d, r9d
+ mova [rsp+160], m0
+ lea r5, [rsp+60] ; left-3
+ mova [rsp+192], m6
+ pxor m1, m1
+ psraw m2, m0, 6
+ pand m0, m14
+ psubw m9, m1, m2 ; base_y
+ psubw m12, m15, m0
+ punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11
+ psllw m0, 8
+ punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15
+ por m12, m0 ; 64-frac_y, frac_y
+.w16_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2]
+ vinserti128 m0, [rsp+r2+8], 1
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movu xm1, [rsp+r3]
+ vinserti128 m1, [rsp+r3+8], 1
+ pand m2, m14, m6
+ paddsw m5, m6, m7
+ psubw m3, m15, m2
+ psllw m2, 8
+ pshufb m0, m10
+ por m2, m3
+ pmaddubsw m0, m2
+ pand m2, m14, m5
+ psubw m3, m15, m2
+ psllw m2, 8
+ pshufb m1, m10
+ por m2, m3
+ pmaddubsw m1, m2
+ cmp r3d, 64
+ jge .w16_toponly
+ punpckhwd m2, m5, m5 ; mask out unnecessary loads
+ vpgatherdd m4, [r5+m9], m2
+ punpcklwd m2, m5, m5
+ vpgatherdd m3, [r5+m8], m2
+ pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1
+ pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1
+ punpcklqdq m2, m3, m4 ; y0
+ punpckhqdq m3, m4 ; y1
+ pmaddubsw m2, m12
+ pmaddubsw m3, m12
+ psraw m6, 15 ; base_x < topleft
+ vpblendvb m0, m2, m6
+ psraw m6, m5, 15
+ vpblendvb m1, m3, m6
+.w16_toponly:
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ paddw m6, m5, m7 ; xpos += dx
+ sub r5, 2
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, (63-16)<<6
+ jge .w16_loop
+.w16_leftonly_loop:
+ mova m0, m7
+ vpgatherdd m4, [r5+m9], m7
+ mova m7, m0
+ vpgatherdd m3, [r5+m8], m0
+ sub r5, 2
+ pshufb m2, m4, m11
+ pshufb m1, m3, m11
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pmaddubsw m0, m12
+ pmaddubsw m1, m12
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_leftonly_loop
+.w16_end:
+ sub r8d, 1<<8
+ jl .w16_ret
+ vpbroadcastd m0, [rsp+156]
+ paddw m0, [rsp+160] ; base_y += 16*dy
+ paddw m6, m13, [rsp+192]
+ add r7, 16
+ add r9d, 16<<6
+ movzx hd, r8b
+ mov dstq, r7
+ paddw m6, m13 ; base_x += 16*64
+ jmp .w16_loop0
+.w16_ret:
+ RET
+.w32:
+ mova m2, [tlq+32]
+ lea r8d, [hq+(1<<8)]
+ mova [rsp+96], m2
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastd m7, [base+z_filter_k+4*2+12*0]
+ vpbroadcastd m8, [base+z_filter_k+4*2+12*1]
+ vpbroadcastd m9, [base+z_filter_k+4*2+12*2]
+ mova xm5, [base+z_filter_s]
+ vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc
+ vinserti128 m1, [tlq+11], 1
+ movu xm6, [base+z_filter_s+12]
+ vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff
+ movu xm3, [tlq+ 6]
+ vinserti128 m3, [tlq+17], 1
+ movd xm0, r6m ; max_width
+ pminsw xm0, xm15
+ vpbroadcastb m10, xm0
+.w32_filter_above:
+ pshufb m0, m1, m5
+ shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de
+ pmaddubsw m0, m7
+ pshufb m2, m1, m4
+ shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff
+ pmaddubsw m2, m8
+ pshufb m1, m5
+ pmaddubsw m1, m9
+ paddw m0, m2
+ paddw m0, m1
+ pshufb m1, m3, m4
+ pmaddubsw m1, m7
+ pshufb m2, m3, m5
+ pmaddubsw m2, m8
+ pshufb m3, m6
+ pmaddubsw m3, m9
+ paddw m1, m2
+ paddw m1, m3
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ psubb m10, [base+pb_1to32]
+ packuswb m0, m1
+ vpblendvb m0, [tlq+1], m10
+ movu [rsp+65], m0
+ jmp .w16_filter_left
+.w64:
+ mova m2, [tlq+32]
+ mov r3d, [tlq+64]
+ lea r8d, [hq+(3<<8)]
+ mova [rsp+ 96], m2
+ mov [rsp+128], r3d
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastd m7, [base+z_filter_k+4*2+12*0]
+ vpbroadcastd m8, [base+z_filter_k+4*2+12*1]
+ vpbroadcastd m9, [base+z_filter_k+4*2+12*2]
+ movu xm6, [base+z_filter_s+ 4]
+ vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc
+ movu xm3, [tlq+30]
+ vinserti128 m3, [tlq+43], 1
+ movu xm5, [base+z_filter_s+16]
+ vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff
+ pshufb m0, m3, m6
+ shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de
+ pmaddubsw m0, m7
+ pshufb m2, m3, m4
+ shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff
+ pmaddubsw m2, m8
+ pshufb m3, m6
+ pmaddubsw m3, m9
+ paddw m0, m2
+ paddw m0, m3
+ movu xm2, [tlq+36]
+ vinserti128 m2, [tlq+49], 1
+ pshufb m4, m2, m4
+ pmaddubsw m4, m7
+ pshufb m3, m2, m6
+ pmaddubsw m3, m8
+ pshufb m2, m5
+ pmaddubsw m2, m9
+ movd xm5, r6m ; max_width
+ pminsw xm5, xm15
+ vpbroadcastb m10, xm5
+ paddw m3, m4
+ paddw m2, m3
+ vpbroadcastd m3, [base+pb_32]
+ pmulhrsw m0, m13
+ pmulhrsw m2, m13
+ mova xm5, [base+z_filter_s]
+ vinserti128 m5, [base+z_filter_s+6], 1
+ psubb m3, m10, m3
+ psubb m3, [base+pb_1to32]
+ vinserti128 m1, [tlq+13], 1
+ packuswb m0, m2
+ vpblendvb m0, [tlq+33], m3
+ movu xm3, [tlq+ 6]
+ vinserti128 m3, [tlq+19], 1
+ movu [rsp+97], m0
+ jmp .w32_filter_above
+
+cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z3_avx2_table]
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ lea r7, [dr_intra_derivative+45*2-1]
+ dec tlq
+ movsxd hq, [r6+hq*4]
+ sub angled, 180
+ add hq, r6
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ movzx dyd, word [r7+dyq]
+ vpbroadcastd m3, [pw_512]
+ vpbroadcastd m4, [pw_62]
+ vpbroadcastd m5, [pw_64]
+ mov org_wd, wd
+ jmp hq
+.h4:
+ lea r7, [strideq*3]
+ cmp angleb, 40
+ jae .h4_no_upsample
+ lea r4d, [angleq-1024]
+ sar r4d, 7
+ add r4d, wd
+ jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+ ALLOC_STACK -32, 9
+ movu xm8, [tlq-7]
+ pshufb xm0, xm8, [z_upsample1-4]
+ vpbroadcastb xm2, xm8
+ pshufb xm1, xm8, [z_filter_s+2]
+ mova [rsp+16], xm2 ; top[max_base_y]
+ vpbroadcastd xm2, [pb_36_m4]
+ add dyd, dyd
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ movd xm7, dyd
+ mov r2d, dyd
+ vpbroadcastw m7, xm7
+ paddw xm1, xm0
+ pmulhrsw xm1, xm3
+ pslldq m6, m7, 8
+ paddw xm2, xm7, xm7
+ paddw m6, m7
+ packuswb xm1, xm1
+ paddw m6, m2
+ punpcklbw xm1, xm8
+ mova xm8, [z_transpose4]
+ psllw m7, 2
+ pshufb xm1, [pb_15to0]
+ mova [rsp], xm1
+.h4_upsample_loop:
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ vpbroadcastq m1, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ vpbroadcastq m2, [rsp+r4]
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ movq xm0, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ movhps xm0, [rsp+r4]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2
+ psllw m2, 8
+ por m1, m2
+ pmaddubsw m0, m1
+ paddw m6, m7
+ pmulhrsw m0, m3
+ vextracti128 xm1, m0, 1
+ packuswb xm1, xm0
+ pshufb xm1, xm8
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+r7 ], xm1, 3
+ add dstq, 4
+ sub wd, 4
+ jg .h4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
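+; computes a filter-strength mask from the edge length (maxbased) and the
+; prediction angle / is_sm flag; the caller's popcnt of the returned r5d mask
+; gives the edge filter strength, with r5d == 0 meaning no filtering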
+%define base r4-z_filter_t0
+ lea r4, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm2, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m2, xm2
+ pcmpeqb m1, m0, [base+z_filter_wh]
+ pand m1, m2
+ mova xm2, [r4+angleq*8]
+ pcmpgtb m1, m2
+ pmovmskb r5d, m1
+ ret
+.h4_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -16, 12
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea maxbased, [wq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m7, [base+pb_7]
+ vbroadcasti128 m2, [tlq-14]
+ pmaxub m1, m7, [base+z_filter_s-4]
+ vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
+ pmaxub m7, [base+z_filter_s+4]
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
+ pshufb m0, m2, m1
+ shufps m1, m7, q2121
+ pmaddubsw m0, m8
+ pshufb m1, m2, m1
+ pmaddubsw m1, m9
+ pshufb m2, m7
+ pmaddubsw m2, m10
+ paddw m0, m1
+ paddw m0, m2
+ pmulhrsw m0, m3
+ mov r4d, 9
+ lea tlq, [rsp+15]
+ cmp wd, 4
+ cmovne maxbased, r4d
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [rsp], xm0
+.h4_main:
+ movd xm6, dyd
+ vpbroadcastq m0, [z_base_inc] ; base_inc << 6
+ mov r4, tlq
+ sub tlq, 4
+ neg dyq
+ vpbroadcastw m6, xm6
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63] ; ypos
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf_w4]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psrlw m7, 8 ; top[max_base_y]
+ paddw m10, m6, m6
+ psubw m9, m0 ; max_base_y
+ vpblendd m6, m10, 0xcc
+ mova xm0, xm10
+ paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1
+ paddw m10, m10
+ mova xm11, [z_transpose4]
+.h4_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ vpbroadcastq m1, [tlq+r4]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ vpbroadcastq m2, [tlq+r5]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ movq xm0, [tlq+r4]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ movhps xm0, [tlq+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m6 ; base < max_base_y
+ pmulhrsw m0, m3
+ paddw m6, m10 ; ypos += dy
+ vpblendvb m0, m7, m0, m1
+ vextracti128 xm1, m0, 1
+ packuswb xm1, xm0
+ pshufb xm1, xm11 ; transpose
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+r7 ], xm1, 3
+ sub wd, 4
+ jz .h4_end
+ add dstq, 4
+ cmp r4d, maxbased
+ jg .h4_loop
+ packuswb xm7, xm7
+.h4_end_loop:
+ movd [dstq+strideq*0], xm7
+ movd [dstq+strideq*1], xm7
+ movd [dstq+strideq*2], xm7
+ movd [dstq+r7 ], xm7
+ add dstq, 4
+ sub wd, 4
+ jg .h4_end_loop
+.h4_end:
+ RET
+ALIGN function_align
+.h8:
+ lea r4d, [angleq+216]
+ mov r4b, wb
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 8
+ and r4d, 4
+ mova xm0, [tlq-15]
+ vinserti128 m0, [tlq- 9], 1
+ movd xm1, r4d
+ movu xm2, [z_filter_s+2]
+ vinserti128 m2, [z_filter_s+6], 1
+ vpbroadcastb xm1, xm1 ; w & 4
+ vpbroadcastd m7, [pb_36_m4]
+ pmaxub xm1, [z_upsample1-4] ; clip 4x8
+ vinserti128 m1, [z_upsample1], 1
+ add dyd, dyd
+ pshufb m1, m0, m1
+ pshufb m2, m0, m2
+ vinserti128 m0, [tlq-7], 1
+ movd xm6, dyd
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ vpbroadcastw m6, xm6
+ mov r2d, dyd
+ lea r5, [strideq*3]
+ paddw m7, m6, m6
+ paddw m1, m2
+ vpblendd m6, m7, 0xf0
+ pmulhrsw m1, m3
+ pslldq m2, m7, 8
+ paddw m7, m7
+ paddw m6, m2
+ vbroadcasti128 m2, [pb_15to0]
+ packuswb m1, m1
+ punpcklbw m1, m0
+ pshufb m1, m2
+ vextracti128 [rsp+ 0], m1, 1
+ mova [rsp+16], xm1
+.h8_upsample_loop:
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base0
+ movu xm0, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base1
+ vinserti128 m0, [rsp+r4], 1
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base2
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ punpcklqdq m1, m2, m2 ; frac0 frac1
+ pmaddubsw m0, m1
+ movu xm1, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base3
+ vinserti128 m1, [rsp+r4], 1
+ punpckhqdq m2, m2 ; frac2 frac3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ paddw m6, m7
+ pmulhrsw m1, m3
+ lea r4, [dstq+strideq*4]
+ psllw m1, 8
+ por m0, m1
+ vextracti128 xm1, m0, 1
+ punpcklbw xm2, xm0, xm1
+ punpckhbw xm0, xm1
+ movd [dstq+strideq*0], xm2
+ pextrd [dstq+strideq*1], xm2, 1
+ pextrd [dstq+strideq*2], xm2, 2
+ pextrd [dstq+r5 ], xm2, 3
+ movd [r4 +strideq*0], xm0
+ pextrd [r4 +strideq*1], xm0, 1
+ pextrd [r4 +strideq*2], xm0, 2
+ pextrd [r4 +r5 ], xm0, 3
+ add dstq, 4
+ sub wd, 4
+ jg .h8_upsample_loop
+ RET
+.h8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(w+7, 15)
+ jmp .h8_main
+.h8_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 10
+ lea maxbased, [wq+7]
+ test angled, 0x400
+ jnz .h8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h8_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd xm6, [base+pb_15]
+ pcmpeqb xm1, xm1
+ psubusb xm6, xm0
+ psubb xm6, xm1 ; w == 4 ? 5 : 1
+ movu xm2, [tlq-16]
+ pmaxub xm1, xm6, [base+z_filter_s]
+ vinserti128 m2, [tlq-14], 1
+ vinserti128 m1, [base+z_filter_s+12], 1
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pmaxub xm6, [base+z_filter_s+ 8]
+ vinserti128 m6, [base+z_filter_s+20], 1
+ pshufb m0, m2, m1
+ pmaddubsw m0, m7
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1]
+ movzx r4d, byte [tlq-15]
+ shufps m1, m6, q2121
+ pshufb m1, m2, m1
+ pmaddubsw m1, m7
+ paddw m0, m1
+ sub r5d, 3
+ jnz .h8_3tap
+ vpbroadcastd m7, [z_filter_k+4*8]
+ movzx r2d, byte [tlq-14]
+ pshufb m2, m6
+ pmaddubsw m2, m7
+ sub r2d, r4d
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+15], r2b
+ paddw m0, m2
+.h8_3tap:
+ pmulhrsw m0, m3
+ sar r5d, 1
+ lea tlq, [rsp+31]
+ add r5d, 17
+ cmp wd, 16
+ cmovns maxbased, r5d
+ neg r5
+ mov [tlq+r5], r4b
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [tlq-15], xm0
+.h8_main:
+ movd xm2, dyd
+ vbroadcasti128 m0, [z_base_inc]
+ mov r4, tlq
+ sub tlq, 8
+ neg dyq
+ vpbroadcastw m2, xm2
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63]
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psrlw m7, 8
+ psubw m9, m0
+ paddw m6, m2, m2
+ vpblendd m2, m6, 0x0f
+.h8_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6
+ pand m0, m4, m2
+ psubw m1, m5, m0
+ psllw m0, 8
+ por m1, m0
+ vbroadcasti128 m0, [tlq+r4]
+ lea r4, [r5+dyq]
+ sar r5, 6
+ vinserti128 m0, [tlq+r5], 0
+ sub rsp, 8*2
+ pshufb m0, m8
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m2
+ paddw m2, m6
+ pmulhrsw m0, m3
+ vpblendvb m0, m7, m0, m1
+ vextracti128 xm1, m0, 1
+ psllw xm0, 8
+ por xm0, xm1 ; interleave rows (partial transpose)
+ mova [rsp], xm0
+ sub wd, 2
+ jz .h8_transpose
+ cmp r4d, maxbased
+ jg .h8_loop
+ packuswb xm0, xm7, xm7
+.h8_end_loop:
+ sub rsp, 8*2
+ mova [rsp], xm0
+ sub wd, 2
+ jg .h8_end_loop
+.h8_transpose:
+ mova xm2, [rsp+16*1]
+ sub org_wd, 8
+ lea r2, [strideq*3]
+ lea r6, [dstq+org_wq]
+ cmovns dstq, r6
+ punpcklwd xm1, xm2, xm0
+ punpckhwd xm2, xm0
+ lea r6, [dstq+strideq*4]
+ jge .h8_w8
+ add rsp, 16*2
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+r2 ], xm1, 3
+ movd [r6 +strideq*0], xm2
+ pextrd [r6 +strideq*1], xm2, 1
+ pextrd [r6 +strideq*2], xm2, 2
+ pextrd [r6 +r2 ], xm2, 3
+ jmp .h8_end
+.h8_w8_loop:
+ mova xm0, [rsp+16*0]
+ mova xm2, [rsp+16*1]
+ punpcklwd xm1, xm2, xm0
+ punpckhwd xm2, xm0
+.h8_w8: ; w8/w16/w32
+ mova xm0, [rsp+16*2]
+ mova xm4, [rsp+16*3]
+ add rsp, 16*4
+ punpcklwd xm3, xm4, xm0
+ punpckhwd xm4, xm0
+ punpckldq xm0, xm3, xm1
+ punpckhdq xm3, xm1
+ punpckldq xm1, xm4, xm2
+ punpckhdq xm4, xm2
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+r2 ], xm3
+ movq [r6 +strideq*0], xm1
+ movhps [r6 +strideq*1], xm1
+ movq [r6 +strideq*2], xm4
+ movhps [r6 +r2 ], xm4
+ sub dstq, 8
+ sub r6, 8
+ sub org_wd, 8
+ jge .h8_w8_loop
+.h8_end:
+ RET
+.h16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(w+15, 31)
+ jmp .h16_main
+ALIGN function_align
+.h16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 12
+ lea maxbased, [wq+15]
+ test angled, 0x400
+ jnz .h16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m11, [base+pb_27]
+ vpbroadcastd m1, [base+pb_1]
+ vbroadcasti128 m6, [base+z_filter_s+12]
+ vinserti128 m2, m6, [base+z_filter_s+4], 0
+ vinserti128 m6, [base+z_filter_s+20], 1
+ movu xm10, [tlq-18]
+ vinserti128 m10, [tlq-14], 1
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0]
+ vbroadcasti128 m7, [base+z_filter_s+8]
+ vinserti128 m8, m7, [base+z_filter_s+0], 0
+ vinserti128 m7, [base+z_filter_s+16], 1
+ psubusb m11, m0
+ por m1, m11
+ movu xm11, [tlq-32]
+ vinserti128 m11, [tlq-28], 1
+ pmaxub m8, m1
+ pmaxub m7, m1
+ pshufb m0, m10, m2
+ shufps m2, m6, q2121
+ pmaddubsw m0, m9
+ pshufb m1, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m1, m9
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ movzx r4d, byte [tlq-31]
+ pshufb m2, m10, m2
+ pmaddubsw m2, m9
+ pshufb m8, m11, m8
+ pmaddubsw m8, m9
+ paddw m0, m2
+ paddw m1, m8
+ sub r5d, 3
+ jnz .h16_3tap
+ vpbroadcastd m9, [z_filter_k+4*8]
+ movzx r2d, byte [tlq-30]
+ pshufb m10, m6
+ pmaddubsw m10, m9
+ pshufb m11, m7
+ pmaddubsw m11, m9
+ sub r2d, r4d
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+31], r2b
+ paddw m0, m10
+ paddw m1, m11
+.h16_3tap:
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ sar r5d, 1
+ lea tlq, [rsp+63]
+ add r5d, 33
+ cmp wd, 32
+ cmovns maxbased, r5d
+ neg r5
+ mov [tlq+r5], r4b
+ packuswb m0, m1
+ vpermq m0, m0, q2031
+ mova [tlq-31], m0
+.h16_main:
+ movd xm6, dyd
+ vbroadcasti128 m0, [z_base_inc]
+ mov r4, tlq
+ sub tlq, 8
+ neg dyq
+ vpbroadcastw m6, xm6
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63]
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psubw m9, m0
+ paddw m11, m6, m6
+ psubw m10, m9, m3 ; 64*8
+ vpblendd m6, m11, 0xf0
+.h16_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu xm0, [tlq+r4-0]
+ movu xm1, [tlq+r4-8]
+ lea r4, [r5+dyq]
+ sar r5, 6
+ vinserti128 m0, [tlq+r5-0], 1
+ vinserti128 m1, [tlq+r5-8], 1
+ sub rsp, 32
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ vpermq m0, m0, q3120
+ mova [rsp], m0
+ sub wd, 2
+ jz .h16_transpose
+ cmp r4d, maxbased
+ jg .h16_loop
+ mova m0, m7
+.h16_end_loop:
+ sub rsp, 32
+ mova [rsp], m7
+ sub wd, 2
+ jg .h16_end_loop
+.h16_transpose:
+ mova m2, [rsp+32*1]
+ sub org_wd, 8
+ lea r2, [strideq*3]
+ lea r6, [dstq+org_wq]
+ cmovns dstq, r6
+ punpcklbw m1, m2, m0
+ punpckhbw m2, m0
+ lea r3, [strideq*5]
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ lea r4, [strideq+r2*2] ; stride*7
+ jge .h16_w8
+ add rsp, 32*2
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ vextracti128 xm0, m0, 1
+ movd [dstq+strideq*4], xm1
+ pextrd [dstq+r3 ], xm1, 1
+ pextrd [dstq+r2*2 ], xm1, 2
+ pextrd [dstq+r4 ], xm1, 3
+ lea dstq, [dstq+strideq*8]
+ vextracti128 xm1, m1, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ movd [dstq+strideq*4], xm1
+ pextrd [dstq+r3 ], xm1, 1
+ pextrd [dstq+r2*2 ], xm1, 2
+ pextrd [dstq+r4 ], xm1, 3
+ jmp .h16_end
+.h16_w8_loop:
+ mova m0, [rsp+32*0]
+ mova m2, [rsp+32*1]
+ punpcklbw m1, m2, m0
+ punpckhbw m2, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+.h16_w8:
+ mova m2, [rsp+32*2]
+ mova m4, [rsp+32*3]
+ lea r6, [dstq+strideq*8]
+ add rsp, 32*4
+ punpcklbw m3, m4, m2
+ punpckhbw m4, m2
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ punpckldq m4, m2, m0
+ punpckhdq m2, m0
+ punpckldq m0, m3, m1
+ punpckhdq m3, m1
+ movq [dstq+strideq*0], xm4
+ movhps [dstq+strideq*1], xm4
+ vextracti128 xm4, m4, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+r2 ], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*4], xm0
+ movhps [dstq+r3 ], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+r2*2 ], xm3
+ movhps [dstq+r4 ], xm3
+ vextracti128 xm3, m3, 1
+ movq [r6+strideq*0], xm4
+ movhps [r6+strideq*1], xm4
+ movq [r6+strideq*2], xm2
+ movhps [r6+r2 ], xm2
+ movq [r6+strideq*4], xm0
+ movhps [r6+r3 ], xm0
+ movq [r6+r2*2 ], xm3
+ movhps [r6+r4 ], xm3
+ sub dstq, 8
+ sub org_wd, 8
+ jge .h16_w8_loop
+.h16_end:
+ RET
+ALIGN function_align
+.h32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 15
+ lea maxbased, [wq+31]
+ and maxbased, 31
+ or maxbased, 32 ; imin(w+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h32_main
+ vbroadcasti128 m0, [pb_0to15]
+ mov r4d, 21
+ mov r5d, 3
+ movu xm11, [tlq-66] ; 56-63
+ vinserti128 m11, [tlq-52], 1 ; 40-47
+ sub r4d, wd ; 21-w
+ cmovns r5d, r4d
+ movu xm12, [tlq-58] ; 48-55
+ vinserti128 m12, [tlq-44], 1 ; 32-39
+ sub r4d, 8 ; 13-w
+ movd xm1, r5d
+ movu xm13, [tlq-34] ; 24-31
+ vinserti128 m13, [tlq-20], 1 ; 8-15
+ movd xm2, r4d
+ vpbroadcastb m1, xm1
+ movu xm14, [tlq-28] ; 16-23
+ vinserti128 m14, [tlq-14], 1 ; 0- 7
+ vpbroadcastb m2, xm2
+ pmaxsb m1, m0 ; clip 16x32 and (32|64)x32
+ movu m7, [z_filter_s+4]
+ pshufb m11, m1
+ vinserti128 m8, m7, [z_filter_s+8], 1
+ vinserti128 m7, [z_filter_s+16], 0
+ pmaxsb m2, m0 ; clip 8x32
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m12, m2
+ pshufb m0, m11, m8
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ shufps m8, m7, q1021
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m10, m11, m8
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m8
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m8
+ pmaddubsw m10, m9
+ shufps m8, m7, q2121
+ paddw m1, m10
+ pshufb m10, m14, m8
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ movzx r4d, byte [tlq-63]
+ movzx r2d, byte [tlq-62]
+ paddw m0, m11
+ paddw m2, m12
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m7
+ pmaddubsw m14, m9
+ paddw m1, m13
+ paddw m6, m14
+ sub r2d, r4d
+ lea r2d, [r2+r4*8+4] ; edge case for 64x32
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ shr r2d, 3
+ mov [rsp+31], r2b
+ lea tlq, [rsp+95]
+ mov [tlq-65], r4b
+ mov r4d, 65
+ cmp wd, 64
+ cmove maxbased, r4d
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq-63], m0
+ mova [tlq-31], m1
+.h32_main:
+ movd xm6, dyd
+ mov r4, tlq
+ sub tlq, 8
+ neg dyq
+ vpbroadcastw m6, xm6
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63]
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psubw m9, [z_base_inc]
+ mova m11, m6
+ psubw m10, m9, m3 ; 64*8
+.h32_loop:
+ mov r5, r4
+ sar r5, 6
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu xm0, [tlq+r5- 0]
+ vinserti128 m0, [tlq+r5-16], 1
+ movu xm1, [tlq+r5- 8]
+ vinserti128 m1, [tlq+r5-24], 1
+ sub rsp, 32
+ add r4, dyq
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ mova [rsp], m0
+ dec wd
+ jz .h32_transpose
+ cmp r4d, maxbased
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 32
+ mova [rsp], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ lea dstq, [dstq+org_wq-8]
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ lea r4, [strideq+r2*2] ; stride*7
+.h32_w8_loop:
+ mova m7, [rsp+32*0]
+ mova m6, [rsp+32*1]
+ mova m5, [rsp+32*2]
+ mova m4, [rsp+32*3]
+ mova m3, [rsp+32*4]
+ mova m2, [rsp+32*5]
+ mova m1, [rsp+32*6]
+ mova m0, [rsp+32*7]
+ lea r6, [dstq+strideq*8]
+ add rsp, 32*8
+ punpcklbw m8, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklwd m7, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpckldq m6, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m3
+ punpckhdq m8, m3
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ punpckldq m5, m0, m4
+ punpckhdq m0, m4
+ movq [dstq+strideq*0], xm6
+ movhps [dstq+strideq*1], xm6
+ vextracti128 xm6, m6, 1
+ movq [dstq+strideq*2], xm7
+ movhps [dstq+r2 ], xm7
+ vextracti128 xm7, m7, 1
+ movq [dstq+strideq*4], xm2
+ movhps [dstq+r3 ], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+r2*2 ], xm8
+ movhps [dstq+r4 ], xm8
+ vextracti128 xm8, m8, 1
+ movq [r6+strideq*0], xm3
+ movhps [r6+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [r6+strideq*2], xm1
+ movhps [r6+r2 ], xm1
+ vextracti128 xm1, m1, 1
+ movq [r6+strideq*4], xm5
+ movhps [r6+r3 ], xm5
+ vextracti128 xm5, m5, 1
+ movq [r6+r2*2 ], xm0
+ movhps [r6+r4 ], xm0
+ lea r6, [r6+strideq*8]
+ vextracti128 xm0, m0, 1
+ movq [r6+strideq*0], xm6
+ movhps [r6+strideq*1], xm6
+ movq [r6+strideq*2], xm7
+ movhps [r6+r2 ], xm7
+ movq [r6+strideq*4], xm2
+ movhps [r6+r3 ], xm2
+ movq [r6+r2*2 ], xm8
+ movhps [r6+r4 ], xm8
+ lea r6, [r6+strideq*8]
+ movq [r6+strideq*0], xm3
+ movhps [r6+strideq*1], xm3
+ movq [r6+strideq*2], xm1
+ movhps [r6+r2 ], xm1
+ movq [r6+strideq*4], xm5
+ movhps [r6+r3 ], xm5
+ movq [r6+r2*2 ], xm0
+ movhps [r6+r4 ], xm0
+ sub dstq, 8
+ sub org_wd, 8
+ jg .h32_w8_loop
+ RET
+ALIGN function_align
+.h64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -128, 16
+ lea maxbased, [wq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h64_main
+ mov r4d, 21
+ vpbroadcastb xm11, [tlq-127]
+ vpblendd xm11, [tlq-130], 0x0e ; 120-127
+ sub r4d, wd ; 21-w
+ mov r5d, 3
+ vinserti128 m11, [tlq-116], 1 ; 104-111
+ movu m7, [z_filter_s+4]
+ cmp wd, 32
+ cmove r4d, r5d
+ vinserti128 m8, m7, [z_filter_s+8], 1
+ vbroadcasti128 m6, [pb_0to15]
+ movd xm1, r4d
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ movu xm12, [tlq-122] ; 112-119
+ vinserti128 m12, [tlq-108], 1 ; 96-103
+ vpbroadcastb m1, xm1
+ movu xm13, [tlq- 98] ; 88- 95
+ vinserti128 m13, [tlq- 84], 1 ; 72- 79
+ movu xm14, [tlq- 90] ; 80- 87
+ vinserti128 m14, [tlq- 76], 1 ; 64- 71
+ vinserti128 m7, [z_filter_s+16], 0
+ pshufb m0, m11, m8
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pmaxsb m1, m6 ; clip (16|32)x64
+ pshufb m13, m1
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ shufps m15, m8, m7, q1021
+ pshufb m10, m11, m15
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m15
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2]
+ shufps m10, m8, m7, q2132
+ pshufb m11, m10
+ pmaddubsw m11, m9
+ pshufb m12, m10
+ pmaddubsw m12, m9
+ pshufb m13, m10
+ pmaddubsw m13, m9
+ pshufb m14, m10
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ movu xm11, [tlq-66] ; 56-63
+ vinserti128 m11, [tlq-52], 1 ; 40-47
+ movu xm12, [tlq-58] ; 48-55
+ vinserti128 m12, [tlq-44], 1 ; 32-39
+ movu xm13, [tlq-34] ; 24-31
+ vinserti128 m13, [tlq-20], 1 ; 8-15
+ movu xm14, [tlq-28] ; 16-23
+ vinserti128 m14, [tlq-14], 1 ; 0- 7
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ lea tlq, [rsp+127]
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq-127], m0
+ mova [tlq- 95], m1
+ pshufb m0, m11, m10
+ pmaddubsw m0, m9
+ pshufb m2, m12, m10
+ pmaddubsw m2, m9
+ pshufb m1, m13, m10
+ pmaddubsw m1, m9
+ pshufb m6, m14, m7
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m7, m11, m15
+ pmaddubsw m7, m9
+ paddw m0, m7
+ pshufb m7, m12, m15
+ pmaddubsw m7, m9
+ paddw m2, m7
+ pshufb m7, m13, m15
+ pmaddubsw m7, m9
+ paddw m1, m7
+ pshufb m7, m14, m10
+ pmaddubsw m7, m9
+ paddw m6, m7
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m15
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq-63], m0
+ mova [tlq-31], m1
+.h64_main:
+ movd xm12, dyd
+ neg maxbaseq
+ vbroadcasti128 m8, [z3_shuf]
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m12, xm12
+ lea r5d, [dyq+maxbaseq-64]
+ neg dyq
+ or maxbased, 63
+ lea r4, [dyq+63]
+ movd xm6, r5d
+ mova xm10, [pb_1to32+16]
+ vinserti128 m10, [pb_1to32], 1
+ vpbroadcastd m11, [pb_32]
+ vpbroadcastw m6, xm6
+.h64_loop:
+ mov r5, r4
+ sar r5, 6
+ movu m0, [tlq+r5-24]
+ movu m1, [tlq+r5-32]
+ pand m2, m4, m6
+ psubw m9, m5, m2
+ psllw m2, 8
+ por m9, m2
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ psraw m2, m6, 6
+ sub rsp, 64
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packsswb m2, m2
+ paddb m2, m10
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [rsp+32], m0
+ movu m0, [tlq+r5-56]
+ movu m1, [tlq+r5-64]
+ add r4, dyq
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ paddb m2, m11
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m6, m12
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [rsp], m0
+ dec wd
+ jz .h64_transpose
+ cmp r4d, maxbased
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 64
+ mova [rsp+32], m7
+ mova [rsp+ 0], m7
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ imul r5, strideq, -8
+ lea dstq, [dstq+org_wq-16]
+ lea r4, [strideq+r2*2] ; stride*7
+.h64_transpose_loop0:
+ lea r6, [rsp+16*3]
+.h64_transpose_loop:
+ mova xm0, [r6+64*15]
+ vinserti128 m0, [r6+64* 7], 1
+ mova xm1, [r6+64*14]
+ vinserti128 m1, [r6+64* 6], 1
+ mova xm2, [r6+64*13]
+ vinserti128 m2, [r6+64* 5], 1
+ mova xm3, [r6+64*12]
+ vinserti128 m3, [r6+64* 4], 1
+ mova xm4, [r6+64*11]
+ vinserti128 m4, [r6+64* 3], 1
+ mova xm5, [r6+64*10]
+ vinserti128 m5, [r6+64* 2], 1
+ mova xm6, [r6+64* 9]
+ vinserti128 m6, [r6+64* 1], 1
+ mova xm7, [r6+64* 8]
+ vinserti128 m7, [r6+64* 0], 1
+ sub r6, 16
+ punpcklbw m8, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklwd m7, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpckldq m6, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m3
+ punpckhdq m8, m3
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ punpckldq m5, m0, m4
+ punpckhdq m0, m4
+ vpermq m6, m6, q3120
+ vpermq m7, m7, q3120
+ vpermq m2, m2, q3120
+ vpermq m8, m8, q3120
+ vpermq m3, m3, q3120
+ vpermq m1, m1, q3120
+ vpermq m5, m5, q3120
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm6
+ vextracti128 [dstq+strideq*1], m6, 1
+ mova [dstq+strideq*2], xm7
+ vextracti128 [dstq+r2 ], m7, 1
+ mova [dstq+strideq*4], xm2
+ vextracti128 [dstq+r3 ], m2, 1
+ mova [dstq+r2*2 ], xm8
+ vextracti128 [dstq+r4 ], m8, 1
+ sub dstq, r5
+ mova [dstq+strideq*0], xm3
+ vextracti128 [dstq+strideq*1], m3, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+r2 ], m1, 1
+ mova [dstq+strideq*4], xm5
+ vextracti128 [dstq+r3 ], m5, 1
+ mova [dstq+r2*2 ], xm0
+ vextracti128 [dstq+r4 ], m0, 1
+ sub dstq, r5
+ cmp r6, rsp
+ jae .h64_transpose_loop
+ add rsp, 64*16
+ lea dstq, [dstq+r5*8-16]
+ sub org_wd, 16
+ jg .h64_transpose_loop0
+.h64_end:
+ RET
+
+%macro FILTER_XMM 4 ; dst, src, tmp, shuf
+%ifnum %4
+ pshufb xm%2, xm%4
+%else
+ pshufb xm%2, %4
+%endif
+ pshufd xm%1, xm%2, q0000 ; p0 p1
+ pmaddubsw xm%1, xm2
+ pshufd xm%3, xm%2, q1111 ; p2 p3
+ pmaddubsw xm%3, xm3
+ paddw xm%1, xm1
+ paddw xm%1, xm%3
+ pshufd xm%3, xm%2, q2222 ; p4 p5
+ pmaddubsw xm%3, xm4
+ paddw xm%1, xm%3
+ pshufd xm%3, xm%2, q3333 ; p6 __
+ pmaddubsw xm%3, xm5
+ paddw xm%1, xm%3
+ psraw xm%1, 4
+ packuswb xm%1, xm%1
+%endmacro
+
+%macro FILTER_YMM 4 ; dst, src, tmp, shuf
+ pshufb m%2, m%4
+ pshufd m%1, m%2, q0000
+ pmaddubsw m%1, m2
+ pshufd m%3, m%2, q1111
+ pmaddubsw m%3, m3
+ paddw m%1, m1
+ paddw m%1, m%3
+ pshufd m%3, m%2, q2222
+ pmaddubsw m%3, m4
+ paddw m%1, m%3
+ pshufd m%3, m%2, q3333
+ pmaddubsw m%3, m5
+ paddw m%1, m%3
+ psraw m%1, 4
+ vperm2i128 m%3, m%1, m%1, 0x01
+ packuswb m%1, m%3
+%endmacro
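+
+; Both FILTER_XMM and FILTER_YMM evaluate the 7-tap filter-intra kernel for a
+; 4x2 block: each output pixel is a tap-weighted sum of the 7 neighbours,
+; rounded by the pw_8 add, shifted right by 4 and clamped by packuswb. A rough
+; scalar model (array layout and names are illustrative, not dav1d's C code):
+;
+;   static void filter_4x2(uint8_t *dst, ptrdiff_t stride,
+;                          const uint8_t p[7], const int8_t taps[8][7])
+;   {
+;       for (int i = 0; i < 8; i++) {
+;           int sum = 8;                        // rounding bias (pw_8)
+;           for (int j = 0; j < 7; j++)
+;               sum += taps[i][j] * p[j];
+;           int px = sum >> 4;                  // psraw 4
+;           dst[(i >> 2) * stride + (i & 3)] =  // packuswb clamp
+;               px < 0 ? 0 : px > 255 ? 255 : px;
+;       }
+;   }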
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order, which
+; increases parallelism compared to doing things row by row. One redundant
+; block is calculated for w8 and w16, two for w32.
+;     w4    w8      w16         w32
+;     1     1 2     1 2 3 5     1 2 3 5 b c d f
+;     2     2 3     2 4 5 7     2 4 5 7 c e f h
+;     3     3 4     4 6 7 9     4 6 7 9 e g h j
+; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
+;     5     8       8           i
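+; (each digit/letter above is the step at which that 4x2 block is produced;
+; blocks sharing a step are computed together in a single ymm pass)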
+
+cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
+%define base r6-ipred_filter_avx2_table
+ lea r6, [filter_intra_taps]
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ add filterq, r6
+ lea r6, [ipred_filter_avx2_table]
+ movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m1, [base+pw_8]
+ vbroadcasti128 m2, [filterq+16*0]
+ vbroadcasti128 m3, [filterq+16*1]
+ vbroadcasti128 m4, [filterq+16*2]
+ vbroadcasti128 m5, [filterq+16*3]
+ add wq, r6
+ mov hd, hm
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 9
+ mova xm8, [base+filter_shuf2]
+ sub tlq, 3
+ sub tlq, hq
+ jmp .w4_loop_start
+.w4_loop:
+ pinsrd xm0, xm6, [tlq+hq], 0
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER_XMM 6, 0, 7, 8
+ movd [dstq+strideq*0], xm6
+ pextrd [dstq+strideq*1], xm6, 1
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 10
+ mova m8, [base+filter_shuf1]
+ FILTER_XMM 7, 0, 6, [base+filter_shuf2]
+ vpbroadcastd m0, [tlq+4]
+ vpbroadcastd m6, [tlq+5]
+ sub tlq, 4
+ sub tlq, hq
+ vpbroadcastq m7, xm7
+ vpblendd m7, m6, 0x20
+.w8_loop:
+ vpbroadcastd xm6, [tlq+hq]
+ palignr m6, m0, 12
+ vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ mova xm6, xm7
+ call .main
+ vpblendd xm6, xm7, 0x0c
+ pshufd xm6, xm6, q3120
+ movq [dstq+strideq*0], xm6
+ movhps [dstq+strideq*1], xm6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign xmm_regs_used 15
+ %assign stack_size_padded 0x98
+ SUB rsp, stack_size_padded
+%endif
+ sub hd, 2
+ TAIL_CALL .w16_main, 0
+.w16_main:
+%if WIN64
+ movaps [rsp+0xa8], xmm6
+ movaps [rsp+0xb8], xmm7
+ movaps [rsp+0x28], xmm8
+ movaps [rsp+0x38], xmm9
+ movaps [rsp+0x48], xmm10
+ movaps [rsp+0x58], xmm11
+ movaps [rsp+0x68], xmm12
+ movaps [rsp+0x78], xmm13
+ movaps [rsp+0x88], xmm14
+%endif
+ FILTER_XMM 12, 0, 7, [base+filter_shuf2]
+ vpbroadcastd m0, [tlq+5]
+ vpblendd m0, [tlq-12], 0x14
+ mova m8, [base+filter_shuf1]
+ vpbroadcastq m7, xm12
+ vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ call .main ; c0 d0 a1 b1 a1 b1 c0 d0
+ movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ vinserti128 m14, m8, [base+filter_shuf3], 0
+ vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
+ FILTER_XMM 6, 9, 10, 14
+ vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
+ vpbroadcastd m9, [tlq+13]
+ vpbroadcastd m10, [tlq+12]
+ psrld m11, m8, 4
+ vpblendd m6, m9, 0x20 ; top
+ sub tlq, 6
+ sub tlq, hq
+.w16_loop:
+ vpbroadcastd xm9, [tlq+hq]
+ palignr m9, m0, 12
+ vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ mova xm13, xm7
+ call .main ; e0 f0 c1 d1 c1 d1 e0 f0
+ vpblendd m9, m12, m10, 0xf0
+ vpblendd m12, m6, 0xc0
+ pshufd m9, m9, q3333
+ vpblendd m9, m6, 0xee
+ vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
+ vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
+ vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
+ mova [dstq+strideq*0], xm9
+ vextracti128 [dstq+strideq*1], m9, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
+ pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
+ vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
+ shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
+ shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm6
+ ret
+ALIGN function_align
+.w32:
+ sub rsp, stack_size_padded
+ sub hd, 2
+ lea r3, [dstq+16]
+ lea r5d, [hq-2]
+ call .w16_main
+ add tlq, r5
+ mov dstq, r3
+ lea r3, [strideq-4]
+ lea r4, [r3+strideq*2]
+ movq xm0, [tlq+21]
+ pinsrd xm0, [dstq-4], 2
+ pinsrd xm0, [dstq+r3*1], 3
+ FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0
+ movq xm7, [dstq+r3*2]
+ pinsrd xm7, [dstq+r4], 2
+ palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6
+ vpbroadcastd m0, [tlq+28]
+ vpbroadcastd m9, [tlq+29]
+ vbroadcasti128 m8, [base+filter_shuf1+16]
+ vpblendd m0, m9, 0x20
+ vpblendd m0, m7, 0x0f
+ vpbroadcastq m7, xm12
+ vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ call .main ; c0 d0 a1 b1 a1 b1 c0 d0
+ add r3, 2
+ lea r4, [r4+strideq*2]
+ movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
+ FILTER_XMM 6, 9, 10, 14
+ vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
+ vpbroadcastd m9, [tlq+37]
+ vpbroadcastd m10, [tlq+36]
+ vpblendd m6, m9, 0x20 ; top
+.w32_loop:
+ movq xm9, [dstq+r3*4]
+ pinsrd xm9, [dstq+r4], 2
+.w32_loop_last:
+ palignr m9, m0, 12
+ vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ mova xm13, xm7 ; c0 d0
+ call .main ; e0 f0 c1 d1 c1 d1 e0 f0
+ vpblendd m9, m12, m10, 0xf0
+ vpblendd m12, m6, 0xc0
+ pshufd m9, m9, q3333
+ vpblendd m9, m6, 0xee
+ vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
+ vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
+ vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
+ mova [dstq+strideq*0], xm9
+ vextracti128 [dstq+strideq*1], m9, 1
+ lea dstq, [dstq+strideq*2]
+ sub r5d, 2
+ jg .w32_loop
+ jz .w32_loop_last
+ vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
+ pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
+ vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
+ shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
+ shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm6
+ RET
+ALIGN function_align
+.main:
+ FILTER_YMM 7, 0, 9, 8
+ ret
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ psignw m3, m%1, m1
+ pabsw m%1, m%1
+ pmulhrsw m%1, m2
+ psignw m%1, m3
+ paddw m%1, m0
+%endmacro
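+
+; Per pixel, IPRED_CFL computes dc + sign(alpha*ac) * ((|alpha*ac| + 32) >> 6),
+; with m0 = dc, m1 = alpha and m2 = |alpha| << 9 set up by the callers; the
+; final clamp happens in the callers' packuswb. A C model of one pixel
+; (illustrative names, not the dav1d API):
+;
+;   static inline uint8_t cfl_px(int dc, int ac, int alpha)
+;   {
+;       int diff = alpha * ac;
+;       int adiff = diff < 0 ? -diff : diff;
+;       int scaled = (adiff + 32) >> 6;              // pmulhrsw by |alpha|<<9
+;       int px = dc + (diff < 0 ? -scaled : scaled); // psignw restores sign
+;       return px < 0 ? 0 : px > 255 ? 255 : px;     // packuswb clamp
+;   }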
+
+cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ lea t0, [ipred_cfl_left_avx2_table]
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ shrx r6d, r6d, wd
+ movd xm3, r6d
+ movsxd r6, [t0+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov t0d, 0x8000
+ shrx t0d, t0d, r6d
+ movd xm3, t0d
+ lea t0, [ipred_cfl_left_avx2_table]
+ movsxd r6, [t0+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h32:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+.h4:
+ pmaddwd xm0, xm2
+ pmulhrsw xm0, xm3
+ vpbroadcastw m0, xm0
+ jmp wq
+
+cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd xm4, t0d
+ tzcnt t0d, t0d
+ movd xm5, t0d
+ lea t0, [ipred_cfl_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ pcmpeqd m3, m3
+ psrlw xm4, 1
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h4:
+ movd xm0, [tlq-4]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w4:
+ movd xm1, [tlq+1]
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334
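+; 0x5556 ~= (1<<16)/3 and 0x3334 ~= (1<<16)/5; shrx by 2*h picks 0x5556
+; (h == 8) or 0x3334 (h == 16), so together with the psrlw 2 below the
+; pixel sum is divided by w + h (12 or 20)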
+ paddw xm0, xm1
+ shrx r6d, r6d, r2d
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ movd xm1, r6d
+ psrlw xm0, 2
+ pmulhuw xm0, xm1
+.w4_end:
+ vpbroadcastw m0, xm0
+.s4:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq]
+ IPRED_CFL 4
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+r6 ], xm5, 1
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ movq xm0, [tlq-8]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w8:
+ movq xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ punpckhqdq xm2, xm0, xm0
+ paddw xm0, xm2
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmove r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w8_end:
+ vpbroadcastw m0, xm0
+.s8:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+ movhps [dstq+strideq*2], xm4
+ movhps [dstq+r6 ], xm5
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova xm0, [tlq-16]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w16:
+ movu xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ vpermq m4, m4, q3120
+ mova [dstq+strideq*0], xm4
+ vextracti128 [dstq+strideq*1], m4, 1
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w32_end:
+ vpbroadcastw m0, xm0
+.s32:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ vpermq m4, m4, q3120
+ mova [dstq], m4
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ lea t0, [ipred_cfl_splat_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [t0+wq*4]
+ vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp wq
+
+cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+ movifnidn hpadd, hpadm
+ movifnidn wd, wm
+ mov hd, hm
+ mov szd, wd
+ mov ac_bakq, acq
+ imul szd, hd
+ shl hpadd, 2
+ sub hd, hpadd
+ vpbroadcastd m2, [pb_2]
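+; pb_2 makes pmaddubsw return 2*(a+b) per horizontal luma pair; after the
+; second row is added below, each ac word is the 2x2 sum << 1, i.e. avg << 3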
+ pxor m4, m4
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq xm0, [yq]
+ movq xm1, [yq+strideq]
+ movhps xm0, [yq+strideq*2]
+ movhps xm1, [yq+stride3q]
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ paddw xm0, xm1
+ mova [acq], xm0
+ paddw xm4, xm0
+ lea yq, [yq+strideq*4]
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ vpermq m0, m0, q1111
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova xm0, [yq]
+ mova xm1, [yq+strideq]
+ vinserti128 m0, [yq+strideq*2], 1
+ vinserti128 m1, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w8_hpad
+.w8_wpad:
+ vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle]
+.w8_wpad_loop:
+ movq xm0, [yq]
+ movq xm1, [yq+strideq]
+ vinserti128 m0, [yq+strideq*2], 1
+ vinserti128 m1, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufb m0, m3
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad_loop
+ test hpadd, hpadd
+ jz .calc_avg
+.w8_hpad:
+ vpermq m0, m0, q3232
+.w8_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad_loop
+ jmp .calc_avg
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w16_hpad_loop
+.w16_wpad:
+ DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+ lea iptrq, [ipred_cfl_ac_420_avx2_table]
+ shl wpadd, 2
+ mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \
+ ipred_cfl_ac_420_avx2_table+wpadq*8-32]
+ movsxd wpadq, [iptrq+wpadq+4]
+ add iptrq, wpadq
+ jmp iptrq
+.w16_pad3:
+ vpbroadcastq m0, [yq]
+ vpbroadcastq m1, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad2:
+ vbroadcasti128 m0, [yq]
+ vbroadcasti128 m1, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad1:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ ; fall-through
+.w16_wpad_end:
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufb m0, m3
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jz .w16_wpad_done
+ jmp iptrq
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg
+.w16_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ dec hpadd
+ jg .w16_hpad_loop
+ ; fall-through
+
+.calc_avg:
+ vpbroadcastd m2, [pw_1]
+ pmaddwd m0, m4, m2
+ vextracti128 xm1, m0, 1
+ tzcnt r1d, szd
+ paddd xm0, xm1
+ movd xm2, r1d
+ movd xm3, szd
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ psrad xm3, 1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm3
+ paddd xm0, xm1
+ psrad xm0, xm2
+ vpbroadcastw m0, xm0
+.sub_loop:
+ mova m1, [ac_bakq]
+ psubw m1, m0
+ mova [ac_bakq], m1
+ add ac_bakq, 32
+ sub szd, 16
+ jg .sub_loop
+ RET
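+
+; The .calc_avg/.sub_loop tail (the 422/444 variants below end the same way)
+; removes the rounded DC average from the ac buffer, roughly:
+;
+;   static void subtract_average(int16_t *ac, int sz) // sz = w*h, power of two
+;   {
+;       int log2sz = 0, sum = 0;
+;       while ((1 << log2sz) < sz) log2sz++;          // tzcnt szd
+;       for (int i = 0; i < sz; i++) sum += ac[i];
+;       int avg = (sum + (sz >> 1)) >> log2sz;        // rounded average
+;       for (int i = 0; i < sz; i++) ac[i] -= avg;    // .sub_loop
+;   }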
+
+cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+ movifnidn hpadd, hpadm
+ movifnidn wd, wm
+ mov hd, hm
+ mov szd, wd
+ mov ac_bakq, acq
+ imul szd, hd
+ shl hpadd, 2
+ sub hd, hpadd
+ vpbroadcastd m2, [pb_4]
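+; pb_4 makes pmaddubsw return 4*(a+b) per horizontal luma pair,
+; i.e. each ac word is already the 2x1 average << 3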
+ pxor m4, m4
+ pxor m5, m5
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq xm1, [yq]
+ movhps xm1, [yq+strideq]
+ movq xm0, [yq+strideq*2]
+ movhps xm0, [yq+stride3q]
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ mova [acq], xm1
+ mova [acq+16], xm0
+ paddw xm4, xm0
+ paddw xm5, xm1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ vpermq m0, m0, q1111
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova xm1, [yq]
+ vinserti128 m1, [yq+strideq], 1
+ mova xm0, [yq+strideq*2]
+ vinserti128 m0, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w8_hpad
+.w8_wpad:
+ vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle]
+.w8_wpad_loop:
+ movq xm1, [yq]
+ vinserti128 m1, [yq+strideq], 1
+ movq xm0, [yq+strideq*2]
+ vinserti128 m0, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pshufb m0, m3
+ pshufb m1, m3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_wpad_loop
+ test hpadd, hpadd
+ jz .calc_avg
+.w8_hpad:
+ vpermq m0, m0, q3232
+.w8_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad_loop
+ jmp .calc_avg
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w16_hpad_loop
+.w16_wpad:
+ DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+ lea iptrq, [ipred_cfl_ac_422_avx2_table]
+ shl wpadd, 2
+ mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \
+ ipred_cfl_ac_422_avx2_table+wpadq*8-32]
+ movsxd wpadq, [iptrq+wpadq+4]
+ add iptrq, wpadq
+ jmp iptrq
+.w16_pad3:
+ vpbroadcastq m1, [yq]
+ vpbroadcastq m0, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad2:
+ vbroadcasti128 m1, [yq]
+ vbroadcasti128 m0, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad1:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ ; fall-through
+.w16_wpad_end:
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pshufb m0, m3
+ pshufb m1, m3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jz .w16_wpad_done
+ jmp iptrq
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg
+.w16_hpad_loop:
+ mova [acq], m0
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ ; fall-through
+
+.calc_avg:
+ vpbroadcastd m2, [pw_1]
+ pmaddwd m5, m5, m2
+ pmaddwd m0, m4, m2
+ paddd m0, m5
+ vextracti128 xm1, m0, 1
+ tzcnt r1d, szd
+ paddd xm0, xm1
+ movd xm2, r1d
+ movd xm3, szd
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ psrad xm3, 1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm3
+ paddd xm0, xm1
+ psrad xm0, xm2
+ vpbroadcastw m0, xm0
+.sub_loop:
+ mova m1, [ac_bakq]
+ psubw m1, m0
+ mova [ac_bakq], m1
+ add ac_bakq, 32
+ sub szd, 16
+ jg .sub_loop
+ RET
+
+cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+ movifnidn hpadd, hpadm
+ movifnidn wd, wm
+ mov hd, hm
+ mov szd, wd
+ imul szd, hd
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m4, m4
+ vpbroadcastd m5, [pw_1]
+ tzcnt r8d, wd
+ lea r5, [ipred_cfl_ac_444_avx2_table]
+ movsxd r8, [r5+r8*4+12]
+ add r5, r8
+
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+ mov ac_bakq, acq
+ jmp r5
+
+.w4:
+ lea stride3q, [strideq*3]
+ pxor xm2, xm2
+.w4_loop:
+ movd xm1, [yq]
+ movd xm0, [yq+strideq*2]
+ pinsrd xm1, [yq+strideq], 1
+ pinsrd xm0, [yq+stride3q], 1
+ punpcklbw xm1, xm2
+ punpcklbw xm0, xm2
+ psllw xm1, 3
+ psllw xm0, 3
+ mova [acq], xm1
+ mova [acq+16], xm0
+ paddw xm1, xm0
+ paddw xm4, xm1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_mul
+ pshufd xm0, xm0, q3232
+ paddw xm1, xm0, xm0
+.w4_hpad_loop:
+ mova [acq], xm0
+ mova [acq+16], xm0
+ paddw xm4, xm1
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .calc_avg_mul
+
+.w8:
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+.w8_loop:
+ movq xm1, [yq]
+ movq xm0, [yq+strideq*2]
+ vinserti128 m1, [yq+strideq], 1
+ vinserti128 m0, [yq+stride3q], 1
+ punpcklbw m1, m2
+ punpcklbw m0, m2
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m1, m0
+ paddw m4, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_mul
+ vpermq m0, m0, q3232
+ paddw m1, m0, m0
+.w8_hpad_loop:
+ mova [acq], m0
+ mova [acq+32], m0
+ paddw m4, m1
+ add acq, 64
+ sub hpadd, 4
+ jg .w8_hpad_loop
+ jmp .calc_avg_mul
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmovzxbw m1, [yq]
+ pmovzxbw m0, [yq+strideq]
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m1, m0
+ pmaddwd m1, m5
+ paddd m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w16_hpad
+.w16_wpad:
+ mova m3, [cfl_ac_444_w16_pad1_shuffle]
+.w16_wpad_loop:
+ vpbroadcastq m1, [yq]
+ vpbroadcastq m0, [yq+strideq]
+ pshufb m1, m3
+ pshufb m0, m3
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m1, m0
+ pmaddwd m1, m5
+ paddd m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_wpad_loop
+ test hpadd, hpadd
+ jz .calc_avg
+.w16_hpad:
+ paddw m1, m0, m0
+ pmaddwd m1, m5
+.w16_hpad_loop:
+ mova [acq], m0
+ mova [acq+32], m0
+ paddd m4, m1
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ jmp .calc_avg
+
+.w32:
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ pmovzxbw m1, [yq]
+ pmovzxbw m0, [yq+16]
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m2, m1, m0
+ pmaddwd m2, m5
+ paddd m4, m2
+ add yq, strideq
+ add acq, 64
+ dec hd
+ jg .w32_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w32_hpad_loop
+.w32_wpad:
+ DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+ lea iptrq, [ipred_cfl_ac_444_avx2_table]
+ add wpadd, wpadd
+ mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table]
+ movsxd wpadq, [iptrq+wpadq+4]
+ add iptrq, wpadq
+ jmp iptrq
+.w32_pad3:
+ vpbroadcastq m1, [yq]
+ pshufb m1, m3
+ vpermq m0, m1, q3232
+ jmp .w32_wpad_end
+.w32_pad2:
+ pmovzxbw m1, [yq]
+ pshufhw m0, m1, q3333
+ vpermq m0, m0, q3333
+ jmp .w32_wpad_end
+.w32_pad1:
+ pmovzxbw m1, [yq]
+ vpbroadcastq m0, [yq+16]
+ pshufb m0, m3
+ ; fall-through
+.w32_wpad_end:
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m2, m1, m0
+ pmaddwd m2, m5
+ paddd m4, m2
+ add yq, strideq
+ add acq, 64
+ dec hd
+ jz .w32_wpad_done
+ jmp iptrq
+.w32_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg
+.w32_hpad_loop:
+ mova [acq], m1
+ mova [acq+32], m0
+ paddd m4, m2
+ add acq, 64
+ dec hpadd
+ jg .w32_hpad_loop
+ jmp .calc_avg
+
+.calc_avg_mul:
+ pmaddwd m4, m5
+.calc_avg:
+ vextracti128 xm1, m4, 1
+ tzcnt r1d, szd
+ paddd xm0, xm4, xm1
+ movd xm2, r1d
+ movd xm3, szd
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ psrad xm3, 1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm3
+ paddd xm0, xm1
+ psrad xm0, xm2
+ vpbroadcastw m0, xm0
+.sub_loop:
+ mova m1, [ac_bakq]
+ psubw m1, m0
+ mova [ac_bakq], m1
+ add ac_bakq, 32
+ sub szd, 16
+ jg .sub_loop
+ RET
+
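+; pal_pred below maps one index byte per pixel through an 8-entry palette
+; (stored as 16-bit values and packed down to bytes); a rough C model with
+; illustrative names, assuming a row-major idx layout:
+;
+;   static void pal_pred_c(uint8_t *dst, ptrdiff_t stride,
+;                          const uint16_t pal[8], const uint8_t *idx,
+;                          int w, int h)
+;   {
+;       for (int y = 0; y < h; y++, dst += stride, idx += w)
+;           for (int x = 0; x < w; x++)
+;               dst[x] = (uint8_t)pal[idx[x]];  // pshufb against packed palette
+;   }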
+cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
+ vbroadcasti128 m4, [palq]
+ lea r2, [pal_pred_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4]
+ packuswb m4, m4
+ add wq, r2
+ lea r2, [strideq*3]
+ jmp wq
+.w4:
+ pshufb xm0, xm4, [idxq]
+ add idxq, 16
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+ALIGN function_align
+.w8:
+ pshufb xm0, xm4, [idxq+16*0]
+ pshufb xm1, xm4, [idxq+16*1]
+ add idxq, 16*2
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+ALIGN function_align
+.w16:
+ pshufb m0, m4, [idxq+32*0]
+ pshufb m1, m4, [idxq+32*1]
+ add idxq, 32*2
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+r2 ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+ALIGN function_align
+.w32:
+ pshufb m0, m4, [idxq+32*0]
+ pshufb m1, m4, [idxq+32*1]
+ pshufb m2, m4, [idxq+32*2]
+ pshufb m3, m4, [idxq+32*3]
+ add idxq, 32*4
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r2 ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32
+ RET
+ALIGN function_align
+.w64:
+ pshufb m0, m4, [idxq+32*0]
+ pshufb m1, m4, [idxq+32*1]
+ pshufb m2, m4, [idxq+32*2]
+ pshufb m3, m4, [idxq+32*3]
+ add idxq, 32*4
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64
+ RET
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred_avx512.asm b/third_party/dav1d/src/x86/ipred_avx512.asm
new file mode 100644
index 0000000000..38c86b54f5
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_avx512.asm
@@ -0,0 +1,1432 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __
+filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10
+ db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6
+ db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0
+ db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0
+ db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16
+ db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16
+ db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0
+ db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0
+ db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8
+ db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4
+ db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0
+ db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0
+ db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8
+ db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4
+ db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0
+ db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0
+ db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14
+ db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12
+ db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0
+ db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0
+filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31
+ db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131
+ db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147
+ db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163
+filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31
+smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9
+ db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13
+ db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11
+ db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15
+smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
+ db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
+smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
+ db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95
+ db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111
+ db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
+ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
+ db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
+
+pb_127_m127: times 2 db 127, -127
+pb_128: times 4 db 128
+pw_128: times 2 dw 128
+pw_255: times 2 dw 255
+
+%define pb_1 (ipred_h_shuf+24)
+%define pb_2 (ipred_h_shuf+20)
+%define pb_3 (ipred_h_shuf+16)
+%define pd_8 (filter_taps+128)
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4)
+
+JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
+JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h
+ lea r5, [ipred_dc_left_8bpc_avx512icl_table]
+ movd xm0, wm
+ tzcnt wd, wm
+ inc tlq
+ movifnidn hd, hm
+ movu ym1, [tlq]
+ movd xmm3, wd
+ movsxd r6, [r5+wq*4]
+ vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
+ psrld xm0, 1
+ vpdpbusd ym0, ym1, ym2
+ add r6, r5
+ add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_left_8bpc_avx512icl_table]
+ mov hd, hm
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movd xm0, hm
+ movu ym1, [tlq]
+ movd xmm3, r6d
+ movsxd r6, [r5+r6*4]
+ vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
+ psrld xm0, 1
+ vpdpbusd ym0, ym1, ym2
+ add r6, r5
+ add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu ym1, [tlq+32] ; unaligned when jumping here from dc_top
+ vpdpbusd ym0, ym1, ym2
+.h32:
+ vextracti32x4 xm1, ym0, 1
+ paddd xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+.h4:
+ vpsrlvd xm0, xmm3
+ lea stride3q, [strideq*3]
+ vpbroadcastb m0, xm0
+ jmp wq
+
+cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm0, r5d
+ tzcnt r5d, r5d
+ movd xmm4, r5d
+ lea r5, [ipred_dc_8bpc_avx512icl_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1]
+ psrld xm0, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd xmm1, [tlq-4]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+.w4:
+ movd xmm1, [tlq+1]
+ vpdpbusd xm0, xmm1, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xmm0, xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xmm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334
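+ ; 0x5556 ~ (1<<16)/3, 0x3334 ~ (1<<16)/5; shrx by h*2 (16 for h=8; 32 for
+ ; h=16, which wraps to 0 and leaves 0x3334 in the low word) selects the
+ ; factor so that pmulhuw below completes the division by w+h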
+ paddd xmm1, xm0
+ shrx r6d, r6d, r2d
+ psrlq xmm0, xmm1, 32
+ paddd xmm0, xmm1
+ movd xmm1, r6d
+ psrld xmm0, 2
+ pmulhuw xmm0, xmm1
+.w4_end:
+ vpbroadcastb xm0, xmm0
+.s4:
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm0
+ movd [dstq+strideq*2], xm0
+ movd [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+.h8:
+ movq xmm1, [tlq-8]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+.w8:
+ movq xmm1, [tlq+1]
+ vextracti32x4 xm2, ym0, 1
+ vpdpbusd xm0, xmm1, xm3
+ paddd xmm2, xm2, xm0
+ punpckhqdq xmm0, xmm2, xmm2
+ paddd xmm0, xmm2
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmove r6d, r2d
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w8_end:
+ vpbroadcastb xm0, xmm0
+.s8:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+.h16:
+ mova xmm1, [tlq-16]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+.w16:
+ movu xmm1, [tlq+1]
+ vextracti32x4 xm2, ym0, 1
+ vpdpbusd xm0, xmm1, xm3
+ paddd xmm2, xm2, xm0
+ punpckhqdq xmm0, xmm2, xmm2
+ paddd xmm0, xmm2
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w16_end:
+ vpbroadcastb xm0, xmm0
+.s16:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+.h32:
+ mova ym1, [tlq-32]
+ vpdpbusd ym0, ym1, ym3
+ jmp wq
+.w32:
+ movu ym1, [tlq+1]
+ vpdpbusd ym0, ym1, ym3
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
+ punpckhqdq xmm0, xmm1, xmm1
+ paddd xmm0, xmm1
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w32_end:
+ vpbroadcastb ym0, xmm0
+.s32:
+ mova [dstq+strideq*0], ym0
+ mova [dstq+strideq*1], ym0
+ mova [dstq+strideq*2], ym0
+ mova [dstq+stride3q ], ym0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+.h64:
+ mova ym1, [tlq-64]
+ mova ym2, [tlq-32]
+ vpdpbusd ym0, ym1, ym3
+ vpdpbusd ym0, ym2, ym3
+ jmp wq
+.w64:
+ movu ym1, [tlq+ 1]
+ movu ym2, [tlq+33]
+ vpdpbusd ym0, ym1, ym3
+ vpdpbusd ym0, ym2, ym3
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
+ punpckhqdq xmm0, xmm1, xmm1
+ paddd xmm0, xmm1
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x33345556
+ shrx r6d, r6d, hd
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w64_end:
+ vpbroadcastb m0, xmm0
+.s64:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movu m0, [tlq+1]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3
+%define base r6-ipred_h_8bpc_avx512icl_table
+ lea r6, [ipred_h_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea stride3q, [strideq*3]
+ sub tlq, hq
+ add wq, r6
+ jmp wq
+.w4:
+ mova xmm1, [base+ipred_h_shuf+16]
+.w4_loop:
+ movd xmm0, [tlq+hq-4]
+ pshufb xmm0, xmm1
+ movd [dstq+strideq*0], xmm0
+ pextrd [dstq+strideq*1], xmm0, 1
+ pextrd [dstq+strideq*2], xmm0, 2
+ pextrd [dstq+stride3q ], xmm0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8:
+ movsldup xmm2, [base+ipred_h_shuf+16]
+ movshdup xmm3, [base+ipred_h_shuf+16]
+.w8_loop:
+ movd xmm1, [tlq+hq-4]
+ pshufb xmm0, xmm1, xmm2
+ pshufb xmm1, xmm3
+ movq [dstq+strideq*0], xmm0
+ movq [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm0
+ movhps [dstq+stride3q ], xmm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m1, [base+smooth_shuf]
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ pshufb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ vpbroadcastd ym3, [base+pb_1]
+ vpord m2, m3, [base+pb_2] {1to16}
+.w32_loop:
+ vpbroadcastd m1, [tlq+hq-4]
+ pshufb m0, m1, m2
+ pshufb m1, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64:
+ vpbroadcastd m4, [base+pb_3]
+ vpbroadcastd m5, [base+pb_2]
+ vpbroadcastd m6, [base+pb_1]
+ pxor m7, m7
+.w64_loop:
+ vpbroadcastd m3, [tlq+hq-4]
+ pshufb m0, m3, m4
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w64_loop
+ RET
+
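+; Paeth predictor: pick whichever of left (m4), top (m6) and topleft (m5) is
+; closest to left + top - topleft; m7 = |top - topleft| (ldiff) only depends
+; on the top row and is precomputed outside the loops.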
+%macro PAETH 0
+ psubusb m1, m5, m4
+ psubusb m0, m4, m5
+ por m1, m0 ; tdiff
+ pavgb m2, m6, m4
+ vpcmpub k1, m1, m7, 1 ; tdiff < ldiff
+ vpblendmb m0{k1}, m4, m6
+ vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8
+ psubusb m3, m5, m2
+ psubb m2, m4
+ psubusb m2, m5
+ por m2, m3
+ pminub m1, m7
+ paddusb m2, m2
+ por m2, m4 ; min(tldiff, 255)
+ vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff
+ vmovdqu8 m0{k1}, m5
+%endmacro
+
+cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3
+ lea r6, [ipred_paeth_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ vpbroadcastb m5, [tlq] ; topleft
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1]
+ lea topq, [tlq+1]
+ sub tlq, hq
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+INIT_YMM avx512icl
+.w4:
+ vpbroadcastd m6, [topq]
+ mova m9, [ipred_h_shuf]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:
+ vpbroadcastq m4, [tlq+hq-8]
+ pshufb m4, m9 ; left
+ PAETH
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm0, 3
+ sub hd, 8
+ jl .w4_ret
+ vextracti32x4 xm0, m0, 1
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm0, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_ret:
+ RET
+INIT_ZMM avx512icl
+.w8:
+ vpbroadcastq m6, [topq]
+ movsldup m9, [smooth_shuf]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w8_loop:
+ vpbroadcastq m4, [tlq+hq-8]
+ pshufb m4, m9
+ PAETH
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ sub hd, 8
+ jl .w8_ret
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jg .w8_loop
+.w8_ret:
+ RET
+.w16:
+ vbroadcasti32x4 m6, [topq]
+ movsldup m9, [smooth_shuf]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w16_loop:
+ vpbroadcastd m4, [tlq+hq-4]
+ pshufb m4, m9
+ PAETH
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ vbroadcasti32x8 m6, [topq]
+ mova ym9, ym8
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w32_loop:
+ vpbroadcastd m4, [tlq+hq-2]
+ pshufb m4, m9
+ PAETH
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ movu m6, [topq]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w64_loop:
+ vpbroadcastb m4, [tlq+hq-1]
+ PAETH
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
+%define base r6-ipred_smooth_v_8bpc_avx512icl_table
+ lea r6, [ipred_smooth_v_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m0, [base+pb_127_m127]
+ vpbroadcastd m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ vpbroadcastb m4, [tlq+hq] ; bottom
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastd m2, [tlq+1]
+ movshdup m5, [smooth_shuf]
+ mova ym6, [smooth_endA]
+ punpcklbw m2, m4 ; top, bottom
+ pmaddubsw m3, m2, m0
+ paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; 128 * top + 129 * bottom + 128
+.w4_loop:
+ vbroadcasti32x4 m0, [weightsq+hq*2]
+ pshufb m0, m5
+ pmaddubsw m0, m2, m0
+ paddw m0, m3
+ vpermb m0, m6, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm1, 2
+ add hq, 8
+ jg .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+stride3q ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ jl .w4_loop
+.ret:
+ RET
+.w8:
+ vpbroadcastq m2, [tlq+1]
+ movshdup m5, [smooth_shuf]
+ mova ym6, [smooth_endA]
+ punpcklbw m2, m4
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1
+.w8_loop:
+ vpbroadcastq m0, [weightsq+hq*2]
+ pshufb m0, m5
+ pmaddubsw m0, m2, m0
+ paddw m0, m3
+ vpermb m0, m6, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 m3, [tlq+1]
+ movshdup m6, [smooth_shuf]
+ mova m7, [smooth_endB]
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w16_loop:
+ vpbroadcastq m1, [weightsq+hq*2]
+ pshufb m1, m6
+ pmaddubsw m0, m2, m1
+ pmaddubsw m1, m3, m1
+ paddw m0, m4
+ paddw m1, m5
+ vpermt2b m0, m7, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ vbroadcasti32x8 m3, [tlq+1]
+ movshdup m6, [smooth_shuf]
+ mova m7, [smooth_endB]
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w32_loop:
+ vpbroadcastd m1, [weightsq+hq*2]
+ pshufb m1, m6
+ pmaddubsw m0, m2, m1
+ pmaddubsw m1, m3, m1
+ paddw m0, m4
+ paddw m1, m5
+ vpermt2b m0, m7, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ movu m3, [tlq+1]
+ mova m6, [smooth_endB]
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w64_loop:
+ vpbroadcastw m1, [weightsq+hq*2]
+ pmaddubsw m0, m2, m1
+ pmaddubsw m1, m3, m1
+ paddw m0, m4
+ paddw m1, m5
+ vpermt2b m0, m6, m1
+ mova [dstq], m0
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
+%define base r5-ipred_smooth_h_8bpc_avx512icl_table
+ lea r5, [ipred_smooth_h_8bpc_avx512icl_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ vpbroadcastb m4, [tlq+r6] ; right
+ mov hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m5, [base+pb_127_m127]
+ vpbroadcastd m6, [base+pw_128]
+ sub tlq, hq
+ add wq, r5
+ vpmovb2m k1, m6
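+ ; the bytes of pw_128 are 0x80, 0x00, so k1 has every even bit set; it is
+ ; used below to merge left pixels into the low byte of each word over the
+ ; broadcast right pixel, forming (left, right) pairs for pmaddubsw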
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movsldup m3, [smooth_shuf]
+ vpbroadcastq m7, [smooth_weights+4*2]
+ mova ym8, [smooth_endA]
+.w4_loop:
+ vpbroadcastq m0, [tlq+hq-8]
+ mova m2, m4
+ vpshufb m2{k1}, m0, m3 ; left, right
+ pmaddubsw m0, m2, m5
+ pmaddubsw m1, m2, m7
+ paddw m2, m6
+ paddw m0, m2
+ paddw m0, m1
+ vpermb m0, m8, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm1, 2
+ sub hd, 8
+ jl .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+stride3q ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.ret:
+ RET
+.w8:
+ movsldup m3, [smooth_shuf]
+ vbroadcasti32x4 m7, [smooth_weights+8*2]
+ mova ym8, [smooth_endA]
+.w8_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ mova m2, m4
+ vpshufb m2{k1}, m0, m3
+ pmaddubsw m0, m2, m5
+ pmaddubsw m1, m2, m7
+ paddw m2, m6
+ paddw m0, m2
+ paddw m0, m1
+ vpermb m0, m8, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m7, [smooth_shuf]
+ vbroadcasti32x4 m8, [smooth_weights+16*2]
+ vbroadcasti32x4 m9, [smooth_weights+16*3]
+ mova m10, [smooth_endB]
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ mova m3, m4
+ vpshufb m3{k1}, m0, m7
+ pmaddubsw m2, m3, m5
+ pmaddubsw m0, m3, m8
+ pmaddubsw m1, m3, m9
+ paddw m3, m6
+ paddw m2, m3
+ paddw m0, m2
+ paddw m1, m2
+ vpermt2b m0, m10, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ mova m10, [smooth_endA]
+ vpbroadcastd ym7, [pb_1]
+ vbroadcasti32x8 m8, [smooth_weights+32*2]
+ vbroadcasti32x8 m9, [smooth_weights+32*3]
+ vshufi32x4 m10, m10, q3120
+.w32_loop:
+ vpbroadcastd m0, [tlq+hq-2]
+ mova m3, m4
+ vpshufb m3{k1}, m0, m7
+ pmaddubsw m2, m3, m5
+ pmaddubsw m0, m3, m8
+ pmaddubsw m1, m3, m9
+ paddw m3, m6
+ paddw m2, m3
+ paddw m0, m2
+ paddw m1, m2
+ vpermt2b m0, m10, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ mova m7, [smooth_weights+64*2]
+ mova m8, [smooth_weights+64*3]
+ mova m9, [smooth_endA]
+.w64_loop:
+ mova m3, m4
+ vpbroadcastb m3{k1}, [tlq+hq-1]
+ pmaddubsw m2, m3, m5
+ pmaddubsw m0, m3, m7
+ pmaddubsw m1, m3, m8
+ paddw m3, m6
+ paddw m2, m3
+ paddw m0, m2
+ paddw m1, m2
+ vpermt2b m0, m9, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
+%define base r5-ipred_smooth_8bpc_avx512icl_table
+ lea r5, [ipred_smooth_8bpc_avx512icl_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ mov hd, hm
+ vpbroadcastb m6, [tlq+r6] ; right
+ sub tlq, hq
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m7, [base+pb_127_m127]
+ vpbroadcastb m0, [tlq] ; bottom
+ vpbroadcastd m1, [base+pw_255]
+ add wq, r5
+ lea v_weightsq, [base+smooth_weights+hq*2]
+ vpmovb2m k1, m1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastd m8, [tlq+hq+1]
+ movsldup m4, [smooth_shuf]
+ movshdup m5, [smooth_shuf]
+ vpbroadcastq m9, [smooth_weights+4*2]
+ mova ym11, [smooth_endA]
+
+ punpcklbw m8, m0 ; top, bottom
+ pmaddubsw m10, m8, m7
+ paddw m1, m8 ; 1 * top + 256 * bottom + 255
+ paddw m10, m1 ; 128 * top + 129 * bottom + 255
+.w4_loop:
+ vpbroadcastq m1, [tlq+hq-8]
+ vbroadcasti32x4 m0, [v_weightsq]
+ add v_weightsq, 16
+ mova m2, m6
+ vpshufb m2{k1}, m1, m4 ; left, right
+ pmaddubsw m1, m2, m7 ; 127 * left - 127 * right
+ pshufb m0, m5
+ pmaddubsw m0, m8, m0
+ paddw m1, m2 ; 128 * left + 129 * right
+ pmaddubsw m2, m9
+ paddw m0, m10
+ paddw m1, m2
+ pavgw m0, m1
+ vpermb m0, m11, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm1, 2
+ sub hd, 8
+ jl .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+stride3q ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.ret:
+ RET
+.w8:
+ vpbroadcastq m8, [tlq+hq+1]
+ movsldup m4, [smooth_shuf]
+ movshdup m5, [smooth_shuf]
+ vbroadcasti32x4 m9, [smooth_weights+8*2]
+ mova ym11, [smooth_endA]
+ punpcklbw m8, m0
+ pmaddubsw m10, m8, m7
+ paddw m1, m8
+ paddw m10, m1
+.w8_loop:
+ vpbroadcastd m1, [tlq+hq-4]
+ vpbroadcastq m0, [v_weightsq]
+ add v_weightsq, 8
+ mova m2, m6
+ vpshufb m2{k1}, m1, m4
+ pmaddubsw m1, m2, m7
+ pshufb m0, m5
+ pmaddubsw m0, m8, m0
+ paddw m1, m2
+ pmaddubsw m2, m9
+ paddw m0, m10
+ paddw m1, m2
+ pavgw m0, m1
+ vpermb m0, m11, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 m9, [tlq+hq+1]
+ movsldup m5, [smooth_shuf]
+ movshdup m10, [smooth_shuf]
+ vbroadcasti32x4 m11, [smooth_weights+16*2]
+ vbroadcasti32x4 m12, [smooth_weights+16*3]
+ mova m15, [smooth_endB]
+ punpcklbw m8, m9, m0
+ punpckhbw m9, m0
+ pmaddubsw m13, m8, m7
+ pmaddubsw m14, m9, m7
+ paddw m0, m1, m8
+ paddw m1, m9
+ paddw m13, m0
+ paddw m14, m1
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ vpbroadcastq m1, [v_weightsq]
+ add v_weightsq, 8
+ mova m4, m6
+ vpshufb m4{k1}, m0, m5
+ pmaddubsw m2, m4, m7
+ pshufb m1, m10
+ pmaddubsw m0, m8, m1
+ pmaddubsw m1, m9, m1
+ paddw m2, m4
+ pmaddubsw m3, m4, m11
+ pmaddubsw m4, m12
+ paddw m0, m13
+ paddw m1, m14
+ paddw m3, m2
+ paddw m4, m2
+ pavgw m0, m3
+ pavgw m1, m4
+ vpermt2b m0, m15, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ vbroadcasti32x8 m9, [tlq+hq+1]
+ movshdup m10, [smooth_shuf]
+ mova m12, [smooth_weights+32*2]
+ vpbroadcastd ym5, [pb_1]
+ mova m15, [smooth_endB]
+ punpcklbw m8, m9, m0
+ punpckhbw m9, m0
+ pmaddubsw m13, m8, m7
+ pmaddubsw m14, m9, m7
+ vshufi32x4 m11, m12, m12, q2020
+ vshufi32x4 m12, m12, q3131
+ paddw m0, m1, m8
+ paddw m1, m9
+ paddw m13, m0
+ paddw m14, m1
+.w32_loop:
+ vpbroadcastd m0, [tlq+hq-2]
+ vpbroadcastd m1, [v_weightsq]
+ add v_weightsq, 4
+ mova m4, m6
+ vpshufb m4{k1}, m0, m5
+ pmaddubsw m2, m4, m7
+ pshufb m1, m10
+ pmaddubsw m0, m8, m1
+ pmaddubsw m1, m9, m1
+ paddw m2, m4
+ pmaddubsw m3, m4, m11
+ pmaddubsw m4, m12
+ paddw m0, m13
+ paddw m1, m14
+ paddw m3, m2
+ paddw m4, m2
+ pavgw m0, m3
+ pavgw m1, m4
+ vpermt2b m0, m15, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ movu m9, [tlq+hq+1]
+ mova m11, [smooth_weights+64*2]
+ mova m2, [smooth_weights+64*3]
+ mova m14, [smooth_endB]
+ punpcklbw m8, m9, m0
+ punpckhbw m9, m0
+ pmaddubsw m12, m8, m7
+ pmaddubsw m13, m9, m7
+ vshufi32x4 m10, m11, m2, q2020
+ vshufi32x4 m11, m2, q3131
+ paddw m0, m1, m8
+ paddw m1, m9
+ paddw m12, m0
+ paddw m13, m1
+.w64_loop:
+ mova m4, m6
+ vpbroadcastb m4{k1}, [tlq+hq-1]
+ vpbroadcastw m1, [v_weightsq]
+ add v_weightsq, 2
+ pmaddubsw m2, m4, m7
+ pmaddubsw m0, m8, m1
+ pmaddubsw m1, m9, m1
+ paddw m2, m4
+ pmaddubsw m3, m4, m10
+ pmaddubsw m4, m11
+ paddw m0, m12
+ paddw m1, m13
+ paddw m3, m2
+ paddw m4, m2
+ pavgw m0, m3
+ pavgw m1, m4
+ vpermt2b m0, m14, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
+ lea r6, [pal_pred_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ vbroadcasti32x4 m4, [palq]
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ packuswb m4, m4
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ pshufb xmm0, xm4, [idxq]
+ add idxq, 16
+ movd [dstq+strideq*0], xmm0
+ pextrd [dstq+strideq*1], xmm0, 1
+ pextrd [dstq+strideq*2], xmm0, 2
+ pextrd [dstq+stride3q ], xmm0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pshufb xmm0, xm4, [idxq+16*0]
+ pshufb xmm1, xm4, [idxq+16*1]
+ add idxq, 16*2
+ movq [dstq+strideq*0], xmm0
+ movhps [dstq+strideq*1], xmm0
+ movq [dstq+strideq*2], xmm1
+ movhps [dstq+stride3q ], xmm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pshufb m0, m4, [idxq]
+ add idxq, 64
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ pshufb m0, m4, [idxq+64*0]
+ pshufb m1, m4, [idxq+64*1]
+ add idxq, 64*2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32
+ RET
+.w64:
+ pshufb m0, m4, [idxq+64*0]
+ pshufb m1, m4, [idxq+64*1]
+ pshufb m2, m4, [idxq+64*2]
+ pshufb m3, m4, [idxq+64*3]
+ add idxq, 64*4
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w64
+ RET
+
+; The ipred_filter code processes 4x2 blocks in the following order,
+; which increases parallelism compared to doing things row by row.
+; Some redundant blocks are calculated for w > 4. A scalar sketch of
+; this schedule follows the diagram below.
+; w4 w8 w16 w32
+; 1 1 2 1 2 3 4 1 2 3 4 9 a b c
+; 2 2 3 2 3 4 5 2 3 4 5 a b c d
+; 3 3 4 3 4 5 6 3 4 5 6 b c d e
+; 4 4 5 4 5 6 7 4 5 6 7 c d e f
+; 5 5 6 5 6 7 8 5 6 7 8 d e f g
+; 6 6 7 6 7 8 9 6 7 8 9 e f g h
+; 7 7 8 7 8 9 a 7 8 9 a f g h i
+; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___
+; 9 9 a b h i j
+; a b i j
+; b j
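+; A scalar sketch of the w16 schedule (illustrative only; filter_4x2_block is
+; a hypothetical helper, and the real code computes the redundant border
+; blocks instead of branching; w4/w8 use 1 or 2 block columns):
+;   for (int step = 1; step < 4 + h/2; step++)
+;       for (int c = 0; c < 4; c++) {        // block (c, r) runs at step r + c
+;           int r = step - c;
+;           if (r >= 1 && r <= h/2)
+;               filter_4x2_block(c, r);      // needs blocks (c, r-1) and (c-1, r)
+;       }
+; For w32 the right 16-wide half follows the same pattern 4 steps later.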
+
+cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt
+%define base r6-filter_taps
+ lea r6, [filter_taps]
+%ifidn fltd, fltm
+ movzx fltd, fltb
+%else
+ movzx fltd, byte fltm
+%endif
+ vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0
+ movifnidn hd, hm
+ shl fltd, 6
+ vpbroadcastd m6, [base+pd_8]
+ vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __
+ vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4
+ vbroadcasti32x4 m8, [r6+fltq+16*1]
+ vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __
+ vbroadcasti32x4 m10, [r6+fltq+16*3]
+ mova xmm0, xm6
+ vpdpbusd xmm0, xmm2, xm7
+ mova xmm1, xm6
+ vpdpbusd xmm1, xmm2, xm8
+ vpdpbusd xmm0, xmm3, xm9
+ vpdpbusd xmm1, xmm3, xm10
+ packssdw xmm0, xmm1
+ cmp wd, 8
+ jb .w4
+ vpbroadcastd ym2, [tlq+5]
+ mova m11, [base+filter_perm]
+ mov r5, 0xffffffffffff000f
+ psrldq xmm2, 1 ; __ t0
+ kmovq k1, r5 ; 0x000f
+ psraw xm5, xmm0, 4
+ packuswb xmm2, xm5 ; __ t0 a0 b0
+ pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1
+ je .w8
+ kxnorb k3, k3, k3 ; 0x00ff
+ vpbroadcastd xm3, [tlq-4]
+ kandnq k2, k3, k1 ; 0xffffffffffff0000
+ vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __
+ mova ym0, ym6
+ vpdpbusd ym0, ym2, ym7
+ mova ym1, ym6
+ vpdpbusd ym1, ym2, ym8
+ pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t0
+ vpbroadcastd m2, [tlq+9]
+ vpdpbusd ym0, ym3, ym9
+ vpdpbusd ym1, ym3, ym10
+ vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __
+ kunpckbw k4, k1, k3 ; 0x0fff
+ packssdw ym0, ym1
+ psraw ym0, 4 ; a0 d0 a1 b1
+ packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1
+ pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2
+ vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __
+ mova m4, m6
+ vpdpbusd m4, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ psrldq m0, m2, 1 ; __ d0 __ b0 __ t0
+ vpbroadcastd m2, [tlq+13]
+ vpdpbusd m4, m3, m9
+ vpdpbusd m1, m3, m10
+ mova m12, [base+filter_end]
+ lea r5d, [hq-6]
+ mov r6, dstq
+ cmovp hd, r5d ; w == 16 ? h : h - 6
+ packssdw m4, m1
+ psraw m4, 4 ; e0 f0 c1 d1 a2 b2
+ packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2
+ pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3
+.w16_loop:
+ vpbroadcastd xm3, [tlq-8]
+ vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __
+ mova m1, m6
+ vpdpbusd m1, m2, m7
+ mova m0, m6
+ vpdpbusd m0, m2, m8
+ sub tlq, 2
+ vpdpbusd m1, m3, m9
+ vpdpbusd m0, m3, m10
+ packssdw m1, m0
+ mova m0, m4
+ psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3
+ packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3
+ pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
+ vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3
+ vextracti32x4 [dstq+strideq*0], m5, 2
+ vextracti32x4 [dstq+strideq*1], m5, 3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ cmp wd, 16
+ je .ret
+ mova xm13, [filter_perm+16]
+ mova xmm3, [r6+strideq*0]
+ punpckhdq xmm3, [r6+strideq*1]
+ vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
+ pinsrb xm3, xmm3, [tlq+r5+16], 7
+ pshufb xm3, xm13
+ vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __
+ mova m0, m6
+ vpdpbusd m0, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ kunpckbw k5, k3, k1 ; 0xff0f
+ lea r3, [strideq*3]
+ vpdpbusd m0, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m0, m1
+ psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3
+ packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
+ vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
+ vpbroadcastd ym2, [tlq+r5+21]
+ pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3
+ vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3
+ vextracti32x4 [dstq+strideq*0], m5, 2
+ vextracti32x4 [dstq+strideq*1], m5, 3
+ punpckhqdq xmm3, [r6+r3]
+ pinsrb xmm3, [r6+strideq*2+15], 11
+ pshufb xm3, xmm3, xm13
+ vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __
+ mova m4, m6
+ vpdpbusd m4, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ kxnord k3, k3, k4 ; 0xfffff0ff
+ lea r4, [strideq*5]
+ vpdpbusd m4, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m4, m1
+ psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3
+ packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3
+ vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3
+ vpbroadcastd m2, [tlq+r5+25]
+ pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3
+ vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3
+ vextracti32x4 [dstq+strideq*2], m5, 2
+ vextracti32x4 [dstq+r3 ], m5, 3
+ punpckhqdq xmm3, [r6+r4]
+ pinsrb xmm3, [r6+strideq*4+15], 11
+ pshufb xm3, xmm3, xm13
+ vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb hb fb __
+ mova m0, m6
+ vpdpbusd m0, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ kunpckwd k1, k1, k2 ; 0x000f0000
+ vpdpbusd m0, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m0, m1
+ psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3
+ packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3
+ vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3
+ vpbroadcastd m2, [tlq+r5+29]
+ pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7
+ vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3
+ vextracti32x4 [dstq+strideq*4], m5, 2
+ vextracti32x4 [dstq+r4 ], m5, 3
+ lea r0, [strideq+r3*2]
+.w32_loop:
+ punpckhqdq xmm3, [r6+r0]
+ pinsrb xmm3, [r6+r3*2+15], 11
+ pshufb xm3, xmm3, xm13
+ vpermb m3{k2}, m11, m1 ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __
+.w32_loop_tail:
+ mova m4, m6
+ vpdpbusd m4, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ vpdpbusd m4, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m4, m1
+ mova m1, m0
+ psraw m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7
+ packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7
+ pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7
+ vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7
+ vextracti32x4 [r6+strideq*0+16], m5, 2
+ vextracti32x4 [r6+strideq*1+16], m5, 3
+ lea r6, [r6+strideq*2]
+ sub r5d, 2
+ jg .w32_loop
+ vpermb m3, m11, m1
+ cmp r5d, -6
+ jg .w32_loop_tail
+.ret:
+ RET
+.w8:
+ vpermb ym3, ym11, ymm2
+.w8_loop:
+ vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __
+ mova ym0, ym6
+ vpdpbusd ym0, ym2, ym7
+ mova ym1, ym6
+ vpdpbusd ym1, ym2, ym8
+ sub tlq, 2
+ vpdpbusd ym0, ym3, ym9
+ vpdpbusd ym1, ym3, ym10
+ mova ym3, ym5
+ packssdw ym0, ym1
+ psraw ym5, ym0, 4 ; c0 d0 a1 b1
+ packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1
+ pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1
+ vpermb ym3, ym11, ym3 ; a0 a1 b0 b1
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w4_loop:
+ vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __
+ mova xmm0, xm6
+ vpdpbusd xmm0, xmm2, xm7
+ mova xmm1, xm6
+ vpdpbusd xmm1, xmm2, xm8
+ sub tlq, 2
+ vpdpbusd xmm0, xmm3, xm9
+ vpdpbusd xmm1, xmm3, xm10
+ packssdw xmm0, xmm1
+.w4:
+ psraw xmm0, 4 ; a0 b0
+ packuswb xmm0, xmm0
+ movd [dstq+strideq*0], xmm0
+ pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0
+ movd [dstq+strideq*1], xmm2
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm
new file mode 100644
index 0000000000..67e90b79ae
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_sse.asm
@@ -0,0 +1,5409 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+; sm_weights[], but modified to precalculate x and 256-x with offsets to
+; enable efficient use of pmaddubsw (which requires signed values)
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
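+; For example, the size-4 weight 149 is stored as the byte pair (21, -22) =
+; (149-128, 127-149), so pmaddubsw with an unsigned pixel pair (a, b) yields
+; (w-128)*a + (127-w)*b in one instruction; adding the separately computed
+; 128*a + 129*b term then recovers w*a + (256-w)*b.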
+
+ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
+ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
+z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7
+z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
+filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
+filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
+z_filter_wh4: db 7, 7, 19, 7,
+z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
+pd_32768: dd 32768
+z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8
+z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64
+z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
+z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15
+ db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3
+z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0
+z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 9, 9, 10, 10, 11
+z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64
+z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
+z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
+z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64
+pw_m1to4: dw -1, -2, -3, -4
+z_filter_k: times 4 db 0, 16
+ times 4 db 0, 20
+ times 4 db 8, 16
+ times 4 db 32, 16
+ times 4 db 24, 20
+ times 4 db 16, 16
+ times 4 db 0, 0
+ times 4 db 0, 0
+pw_8: times 8 db 8, 0
+pb_3: times 16 db 3
+pb_16: times 16 db 16
+pw_62: times 8 dw 62
+pw_64: times 8 dw 64
+pw_256: times 8 dw 256
+pw_512: times 8 dw 512
+pw_m256: times 8 dw -256
+pb_2: times 8 db 2
+pb_4: times 8 db 4
+pb_8: times 8 db 8
+pb_128: times 8 db 128
+pb_m16: times 8 db -16
+pw_128: times 4 dw 128
+pw_255: times 4 dw 255
+pb_36_m4: times 4 db 36, -4
+pb_127_m127: times 4 db 127, -127
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
+%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)
+
+JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
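+; Scalar reference (sketch only): each output row is filled with one pixel
+; taken from the left edge, topleft[-(1 + y)]:
+;   for (int y = 0; y < height; y++)
+;       memset(dst + y * stride, topleft[-(1 + y)], width);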
+%macro IPRED_SET 3 ; width, stride offset, pshuflw imm8
+ pshuflw m1, m0, %3 ; broadcast one left pixel across the low 8 bytes
+ punpcklqdq m1, m1
+ mova [dstq + %2], m1
+%if %1 > 16
+ mova [dstq + 16 + %2], m1
+%endif
+%if %1 > 32
+ mova [dstq + 32 + %2], m1
+ mova [dstq + 48 + %2], m1
+%endif
+%endmacro
+
+%macro IPRED_H 1 ; width
+ sub tlq, 4
+ movd m0, [tlq] ; load 4 left-edge pixels
+ punpcklbw m0, m0 ; duplicate each byte
+%if %1 == 4
+ pshuflw m1, m0, q2233
+ movd [dstq+strideq*0], m1
+ psrlq m1, 32
+ movd [dstq+strideq*1], m1
+ pshuflw m0, m0, q0011
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+stride3q ], m0
+
+%elif %1 == 8
+ punpcklwd m0, m0
+ punpckhdq m1, m0, m0
+ punpckldq m0, m0
+ movq [dstq+strideq*1], m1
+ movhps [dstq+strideq*0], m1
+ movq [dstq+stride3q ], m0
+ movhps [dstq+strideq*2], m0
+%else
+ IPRED_SET %1, 0, q3333
+ IPRED_SET %1, strideq, q2222
+ IPRED_SET %1, strideq*2, q1111
+ IPRED_SET %1, stride3q, q0000
+%endif
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+%endmacro
+
+INIT_XMM ssse3
+cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_h_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ IPRED_H 4
+.w8:
+ IPRED_H 8
+.w16:
+ IPRED_H 16
+.w32:
+ IPRED_H 32
+.w64:
+ IPRED_H 64
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_ssse3_table
+ tzcnt wd, wm
+ movu m0, [tlq+ 1]
+ movu m1, [tlq+17]
+ movu m2, [tlq+33]
+ movu m3, [tlq+49]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd m4, r5d
+ tzcnt r5d, r5d
+ movd m5, r5d
+ LEA r5, ipred_dc_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+20]
+ pcmpeqd m3, m3
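+ ; m3 = all -1 bytes: pmaddubsw of unsigned pixels with -1 gives negated
+ ; pairwise sums; the final pmaddwd with -1 words flips the sign back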
+ psrlw m4, 1 ; dc = (width + height) >> 1;
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd m0, [tlq-4]
+ pmaddubsw m0, m3
+ jmp wq
+.w4:
+ movd m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m0, m4
+ paddw m0, m1
+ pmaddwd m0, m3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3 ; dc >>= ctz(width + height);
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq m1, m0, m0
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8
+ cmovz r6d, r2d
+ movd m5, r6d
+ pmulhuw m0, m5
+.w4_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s4:
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ movq m0, [tlq-8]
+ pmaddubsw m0, m3
+ jmp wq
+.w8:
+ movq m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w8_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s8:
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-16]
+ pmaddubsw m0, m3
+ jmp wq
+.w16:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w16_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ mova m2, [tlq-16]
+ pmaddubsw m2, m3
+ paddw m0, m2
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ movu m2, [tlq+17]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 64|16
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w32_end:
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+.s32:
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq*2], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q], m0
+ mova [dstq+stride3q+16], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-64]
+ mova m1, [tlq-48]
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ paddw m0, m1
+ mova m1, [tlq-32]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ mova m1, [tlq-16]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 1]
+ movu m2, [tlq+17]
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+33]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+49]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w64_end:
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s64:
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq+32], m2
+ mova [dstq+strideq+48], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_left_ssse3_table
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+ movd m2, r6d
+ psrld m3, m2
+ movsxd r6, [r5+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m1, [tlq+48] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+ movu m1, [tlq+32] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h32:
+ movu m1, [tlq+16] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h16:
+ pshufd m1, m0, q3232 ; psrlq m1, m0, 16
+ paddw m0, m1
+.h8:
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m2
+ pmulhrsw m0, m3
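+ ; m3 = 32768 >> log2(h), so pmulhrsw is a rounded division of the sum by h
+ ; (the +h/2 rounding term comes from pmulhrsw itself)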
+ lea stride3q, [strideq*3]
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
+ LEA r5, ipred_dc_left_ssse3_table
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+ movd m2, wd
+ psrld m3, m2
+ movsxd r6, [r5+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
+ ; w * a = (w - 128) * a + 128 * a
+ ; (256 - w) * b = (127 - w) * b + 129 * b
+ ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
+ pmaddubsw m6, m%3, m%1
+ pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b
+ paddw m6, m%5
+ paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
+ psrlw m6, 8
+ psrlw m0, 8
+ packuswb m6, m0
+%endmacro
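+; e.g. with w = 149, a = 200, b = 100: w*a + (256-w)*b = 149*200 + 107*100 =
+; 40500, and [(w-128)*a + (127-w)*b] + [128*a + 129*b] =
+; (4200 - 2200) + (25600 + 12900) = 40500, matching the identity above.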
+
+cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_ssse3_table
+ LEA r6, ipred_smooth_v_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ movd m5, [tlq+hq]
+ pxor m2, m2
+ pshufb m5, m2
+ add wq, r6
+ jmp wq
+.w4:
+ movd m2, [tlq+1]
+ punpckldq m2, m2
+ punpcklbw m2, m5 ; top, bottom
+ lea r3, [strideq*3]
+ mova m4, [base+ipred_v_shuf]
+ mova m5, m4
+ punpckldq m4, m4
+ punpckhdq m5, m5
+ pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom
+ paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128
+.w4_loop:
+ movu m1, [weightsq+hq*2]
+ pshufb m0, m1, m4 ; m2-m5 must remain unchanged across the loop
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ movd [dstq+strideq*0], m6
+ pshuflw m1, m6, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m6, m6
+ movd [dstq+strideq*2], m6
+ psrlq m6, 32
+ movd [dstq+r3 ], m6
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ movq m2, [tlq+1]
+ punpcklbw m2, m5
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ pshufd m4, m5, q0000
+ pshufd m5, m5, q1111
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1 ; m3 is the precomputed term used in the loop
+.w8_loop:
+ movq m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ movq [dstq+strideq*0], m6
+ movhps [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1 ; m4 and m5 are the precomputed terms used in the loop
+.w16_loop:
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add dstq, strideq
+ add hq, 1
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32:
+%if WIN64
+ movaps [rsp+24], xmm7
+ %define xmm_regs_used 8
+%endif
+ mova m7, m5
+.w32_loop_init:
+ mov r3d, 2
+.w32_loop:
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add tlq, 16
+ add dstq, 16
+ dec r3d
+ jg .w32_loop
+ lea dstq, [dstq-32+strideq]
+ sub tlq, 32
+ add hq, 1
+ jl .w32_loop_init
+ RET
+ALIGN function_align
+.w64:
+%if WIN64
+ movaps [rsp+24], xmm7
+ %define xmm_regs_used 8
+%endif
+ mova m7, m5
+.w64_loop_init:
+ mov r3d, 4
+.w64_loop:
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add tlq, 16
+ add dstq, 16
+ dec r3d
+ jg .w64_loop
+ lea dstq, [dstq-64+strideq]
+ sub tlq, 64
+ add hq, 1
+ jl .w64_loop_init
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
+%define base r6-ipred_smooth_h_ssse3_table
+ LEA r6, ipred_smooth_h_ssse3_table
+ mov wd, wm
+ movd m3, [tlq+wq]
+ pxor m1, m1
+ pshufb m3, m1 ; right
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m4, [base+pb_127_m127]
+ movddup m5, [base+pw_128]
+ add wq, r6
+ jmp wq
+.w4:
+ movddup m6, [base+smooth_weights+4*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ sub tlq, hq
+ lea r3, [strideq*3]
+.w4_loop:
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r3 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova m6, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ sub tlq, hq
+ punpckldq m7, m7
+.w8_loop:
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ mova m6, [base+smooth_weights+16*2]
+ mova m7, [base+smooth_weights+16*3]
+ sub tlq, 1
+ sub tlq, hq
+.w16_loop:
+ pxor m1, m1
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m1
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ lea dstq, [dstq+strideq]
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ sub tlq, 1
+ sub tlq, hq
+ pxor m6, m6
+.w32_loop_init:
+ mov r5, 2
+ lea r3, [base+smooth_weights+16*4]
+.w32_loop:
+ mova m7, [r3]
+ add r3, 16
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m6
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m7
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ mova m7, [r3]
+ add r3, 16
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 16
+ dec r5
+ jg .w32_loop
+ lea dstq, [dstq-32+strideq]
+ sub hd, 1
+ jg .w32_loop_init
+ RET
+ALIGN function_align
+.w64:
+ sub tlq, 1
+ sub tlq, hq
+ pxor m6, m6
+.w64_loop_init:
+ mov r5, 4
+ lea r3, [base+smooth_weights+16*8]
+.w64_loop:
+ mova m7, [r3]
+ add r3, 16
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m6
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m7
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ mova m7, [r3]
+ add r3, 16
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 16
+ dec r5
+ jg .w64_loop
+ lea dstq, [dstq-64+strideq]
+ sub hd, 1
+ jg .w64_loop_init
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3
+ pmaddubsw m6, m%3, m%1
+ mova m0, m6
+ pmaddubsw m6, m%4, m%2
+ mova m1, m6
+%ifnum %5
+ paddw m0, m%5
+%else
+ paddw m0, %5
+%endif
+%ifnum %6
+ paddw m1, m%6
+%else
+ paddw m1, %6
+%endif
+%ifnum %7
+%else
+ mova m3, %7
+%endif
+ pavgw m0, m2
+ pavgw m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
+
+%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
+ mova m1, [rsp+16*%1] ; top
+ punpckhbw m6, m1, m0 ; top, bottom
+ punpcklbw m1, m0 ; top, bottom
+ pmaddubsw m2, m1, m5
+ mova [rsp+16*%2], m1
+ paddw m1, m3 ; 1 * top + 256 * bottom + 255
+ paddw m2, m1 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*%3], m2
+ pmaddubsw m2, m6, m5
+ mova [rsp+16*%4], m6
+ paddw m6, m3 ; 1 * top + 256 * bottom + 255
+ paddw m2, m6 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*%5], m2
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
+ punpcklbw m1, m4 ; left, right
+ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
+ paddw m2, m1 ; 128 * left + 129 * right
+ mova m3, m2
+ pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width];
+ pmaddubsw m1, %7
+ paddw m2, m3, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ mova m7, [rsp+16*%9]
+ pshufb m1, m7
+ mova [rsp+16*%8], m3
+ mova m4, [rsp+16*%2]
+ mova m5, [rsp+16*%3]
+ mova m3, [rsp+16*%4]
+ mova m7, [rsp+16*%5]
+ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8]
+ mova [dstq], m0
+ movddup m3, [base+pw_255] ; recovery
+ mova m0, [rsp+16*%10] ; recovery
+ mova m4, [rsp+16*%11] ; recovery
+ mova m5, [rsp+16*%12] ; recovery
+%endmacro
+
+cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_ssse3_table
+ mov wd, wm
+ mov hd, hm
+ LEA r6, ipred_smooth_ssse3_table
+ movd m4, [tlq+wq] ; right
+ pxor m2, m2
+ pshufb m4, m2
+ tzcnt wd, wd
+ mov r5, tlq
+ sub r5, hq
+ movsxd wq, [r6+wq*4]
+ movddup m5, [base+pb_127_m127]
+ movd m0, [r5]
+ pshufb m0, m2 ; bottom
+ movddup m3, [base+pw_255]
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height]
+ jmp wq
+.w4:
+ mova m7, [base+ipred_v_shuf]
+ movd m1, [tlq+1] ; left
+ pshufd m1, m1, q0000
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m1, m0 ; top, bottom
+ pshufd m6, m7, q1100
+ pshufd m7, m7, q3322
+ pmaddubsw m2, m1, m5
+ paddw m3, m1 ; 1 * top + 256 * bottom + 255
+ paddw m2, m3 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width];
+ punpcklqdq m1, m1
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m6
+ mova [rsp+16*5], m5
+.w4_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+ipred_h_shuf]
+ punpcklbw m0, m1, m4 ; left, right
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
+ pmaddubsw m3, m1, m5
+ paddw m2, m0 ; 128 * left + 129 * right
+ paddw m3, m1
+ mova m4, [rsp+16*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m4
+ paddw m2, m0
+ paddw m3, m1
+ movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 8
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
+ mova m4, [rsp+16*3]
+ mova m6, [rsp+16*4]
+ mova m5, [rsp+16*5]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r3 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova m7, [base+ipred_v_shuf]
+ movq m1, [tlq+1] ; left
+ punpcklqdq m1, m1
+ sub tlq, 4
+ sub tlq, hq
+ punpcklbw m1, m0
+ pshufd m6, m7, q0000
+ pshufd m7, m7, q1111
+ pmaddubsw m2, m1, m5
+ paddw m3, m1
+ paddw m2, m3
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m6
+ mova [rsp+16*5], m5
+.w8_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+ipred_h_shuf]
+ pshufd m1, m1, q1100
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5
+ pmaddubsw m3, m1, m5
+ paddw m2, m0
+ paddw m3, m1
+ mova m4, [rsp+16*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m4
+ paddw m2, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 4
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
+ mova m4, [rsp+16*3]
+ mova m6, [rsp+16*4]
+ mova m5, [rsp+16*5]
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ mova m7, [base+ipred_v_shuf]
+ movu m1, [tlq+1] ; left
+ sub tlq, 4
+ sub tlq, hq
+ punpckhbw m6, m1, m0 ; top, bottom
+ punpcklbw m1, m0 ; top, bottom
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ pmaddubsw m2, m6, m5
+ mova [rsp+16*5], m6
+ paddw m6, m3 ; 1 * top + 256 * bottom + 255
+ paddw m2, m6 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*6], m2
+ pmaddubsw m2, m1, m5
+ paddw m3, m1 ; 1 * top + 256 * bottom + 255
+ mova [rsp+16*0], m1
+ paddw m2, m3 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*1], m2
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m5
+.w16_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
+ punpcklbw m1, m4 ; left, right
+ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
+ paddw m2, m1 ; 128 * left + 129 * right
+ mova m0, m1
+ mova m3, m2
+ pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width];
+ pmaddubsw m1, [base+smooth_weights+16*3]
+ paddw m2, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 2
+ mova m7, [rsp+16*2]
+ pshufb m1, m7
+ mova [rsp+16*7], m3
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ mova m3, [rsp+16*5]
+ mova m7, [rsp+16*6]
+ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
+ mova m4, [rsp+16*3]
+ mova m5, [rsp+16*4]
+ mova [dstq], m0
+ lea dstq, [dstq+strideq]
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m1, [tlq+1] ; top topleft[1 + x]
+ movu m2, [tlq+17] ; top
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ sub tlq, 4
+ sub tlq, hq
+ mova m7, [base+ipred_v_shuf]
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ mova [rsp+16*3], m0
+ mova [rsp+16*4], m4
+ mova [rsp+16*5], m5
+.w32_loop:
+ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
+ lea dstq, [dstq-16+strideq]
+ add v_weightsq, 2
+ sub hd, 1
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m1, [tlq+1] ; top topleft[1 + x]
+ movu m2, [tlq+17] ; top
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ movu m1, [tlq+33] ; top
+ movu m2, [tlq+49] ; top
+ mova [rsp+16*11], m1
+ mova [rsp+16*12], m2
+ sub tlq, 4
+ sub tlq, hq
+ mova m7, [base+ipred_v_shuf]
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ mova [rsp+16*3], m0
+ mova [rsp+16*4], m4
+ mova [rsp+16*5], m5
+.w64_loop:
+ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
+ lea dstq, [dstq-48+strideq]
+ add v_weightsq, 2
+ sub hd, 1
+ jg .w64_loop
+ RET
+
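+; Directional prediction, zone 1 (predicting from the top edge). A scalar sketch
+; of the reference, with dx looked up from dr_intra_derivative and
+; base_inc = 1 << upsample_above:
+;   xpos = (y + 1) * dx, frac = xpos & 0x3e, base = (xpos >> 6) + x * base_inc
+;   dst[y][x] = base < max_base_x
+;             ? (top[base] * (64 - frac) + top[base + 1] * frac + 32) >> 6
+;             : top[max_base_x]
+; pmulhrsw against pw_512 implements the (+32) >> 6 rounding throughout.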
+%if ARCH_X86_64
+cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
+ %define base r7-$$
+ lea r7, [$$]
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ mova m10, [base+pw_512]
+%else
+cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define strideq r3
+ %define stridemp dword [rsp+16*12]
+ mov stridemp, r1
+ LEA r1, $$
+%endif
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ inc tlq
+ movsxd wq, [base+ipred_z1_ssse3_table+wq*4]
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ lea wq, [base+wq+ipred_z1_ssse3_table]
+ movzx dxd, word [base+dr_intra_derivative+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ jmp wq
+.w4:
+ lea r3d, [angleq+88]
+ test r3d, 0x480
+ jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r3d, 9
+ add r3d, hd
+ cmp r3d, 8
+ jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
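+; edge upsampling, as in the scalar reference:
+;   out[2*i+0] = in[i]
+;   out[2*i+1] = clip_pixel((-in[i-1] + 9*in[i] + 9*in[i+1] - in[i+2] + 8) >> 4)
+; pb_36_m4 holds the (9, -1) taps scaled by 4 so that pmulhrsw with pw_512
+; performs the (+8) >> 4 rounding; dx is doubled to step the finer grid.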
+ mova m1, [tlq-1]
+ pshufb m0, m1, [base+z_upsample1]
+ pshufb m1, [base+z_upsample2]
+ movddup m2, [base+pb_36_m4]
+ add dxd, dxd
+ pmaddubsw m0, m2
+ pshufd m7, m1, q3333
+ movd [rsp+16], m7 ; top[max_base_x]
+ pmaddubsw m1, m2
+ movd m6, dxd
+ mov r5d, dxd ; xpos
+ pshufb m6, [base+pw_256]
+ paddw m1, m0
+ movq m0, [tlq]
+ pmulhrsw m1, m10
+ paddw m7, m6, m6
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ packuswb m1, m1
+ punpcklbw m0, m1
+ movifnidn strideq, stridemp
+ mova [rsp], m0
+.w4_upsample_loop:
+ lea r2d, [r5+dxq]
+ shr r5d, 6 ; base0
+ movq m0, [rsp+r5]
+ lea r5d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movhps m0, [rsp+r2]
+ pand m2, m8, m6 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m10
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_upsample_loop
+ RET
+.w4_no_upsample:
+ mov r3d, 7 ; max_base
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea r3d, [hq+3]
+ movd m0, r3d
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ pcmpeqb m1, m0, [base+z_filter_wh4]
+ pand m1, m2
+ pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r3d, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ mova m3, [tlq-1]
+ imul r5d, 0x55555555
+ movu m7, [base+z_filter_s+8]
+ shr r5d, 30 ; filter_strength
+ movddup m0, [base+pb_8]
+ pminub m7, m0
+ pshufb m0, m3, [base+z_filter_s]
+ movddup m4, [base+z_filter_k-8+r5*8+24*0]
+ pshufb m3, m7
+ movddup m5, [base+z_filter_k-8+r5*8+24*1]
+ shufps m2, m0, m3, q2121
+ movddup m6, [base+z_filter_k-8+r5*8+24*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m2, m4
+ pmaddubsw m2, m5
+ paddd m5, m6
+ pmaddubsw m4, m3, m5
+ pmaddubsw m3, m6
+ paddw m0, m2
+ paddw m1, m4
+ paddw m0, m3
+ pshufd m1, m1, q3333
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ mov r5d, 9
+ mov tlq, rsp
+ cmp hd, 4
+ cmovne r3d, r5d
+ packuswb m0, m1
+ mova [tlq], m0
+.w4_main:
+ add tlq, r3
+ movd m5, dxd
+ movddup m0, [base+z_base_inc] ; base_inc << 6
+ movd m7, [tlq] ; top[max_base_x]
+ shl r3d, 6
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd ; xpos
+ pshufb m7, [base+pw_m256]
+ sub r5, r3
+ pshufb m4, [base+pw_256]
+ mova m3, [base+z1_shuf_w4]
+ paddw m6, m5, m5
+ psubw m4, m0 ; max_base_x
+ punpcklqdq m5, m6 ; xpos0 xpos1
+.w4_loop:
+ lea r3, [r5+dxq]
+ sar r5, 6 ; base0
+ movq m0, [tlq+r5]
+ lea r5, [r3+dxq]
+ sar r3, 6 ; base1
+ movhps m0, [tlq+r3]
+ pand m2, m8, m5 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ movifnidn strideq, stridemp
+ pcmpgtw m1, m4, m5 ; base < max_base_x
+ pmulhrsw m0, m10
+ paddw m5, m6 ; xpos += dx
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ test r5d, r5d
+ jl .w4_loop
+ packuswb m7, m7
+.w4_end_loop:
+ movd [dstq+strideq*0], m7
+ movd [dstq+strideq*1], m7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_end_loop
+.w4_end:
+ RET
+.w8:
+ lea r3d, [angleq+88]
+ and r3d, ~0x7f
+ or r3d, hd
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ mova m5, [base+z_upsample1]
+ movu m3, [base+z_filter_s+6]
+ movd m4, hd
+ mova m0, [tlq-1]
+ movu m1, [tlq+7]
+ pxor m7, m7
+ pshufb m4, m7
+ movddup m7, [base+pb_36_m4]
+ pminub m4, m3
+ add dxd, dxd
+ pshufb m2, m0, m5
+ pmaddubsw m2, m7
+ pshufb m0, m3
+ pmaddubsw m0, m7
+ movd m6, dxd
+ pshufb m3, m1, m5
+ pmaddubsw m3, m7
+ pshufb m1, m4
+ pmaddubsw m1, m7
+ pshufb m6, [base+pw_256]
+ mov r5d, dxd
+ paddw m2, m0
+ paddw m7, m6, m6
+ paddw m3, m1
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ movu m1, [tlq]
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ packuswb m2, m3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ movifnidn strideq, stridemp
+ mova [rsp+16*0], m0
+ mova [rsp+16*1], m1
+.w8_upsample_loop:
+ lea r2d, [r5+dxq]
+ shr r5d, 6 ; base0
+ movu m0, [rsp+r5]
+ lea r5d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu m1, [rsp+r2]
+ pand m2, m8, m6
+ psubw m3, m9, m2
+ psllw m2, 8
+ por m3, m2
+ punpcklqdq m2, m3, m3 ; frac0
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3 ; frac1
+ pmaddubsw m1, m3
+ paddw m6, m7
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_upsample_loop
+ RET
+.w8_no_upsample:
+ lea r3d, [hq+7]
+ movd m0, r3d
+ and r3d, 7
+ or r3d, 8 ; imin(h+7, 15)
+ test angled, 0x400
+ jnz .w8_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movu m1, [base+z_filter_wh8]
+ psrldq m3, [base+z_filter_t_w48+angleq*8], 4
+ pcmpeqb m1, m0
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_main ; filter_strength == 0
+ movd m3, [tlq-1]
+ movu m0, [tlq+16*0]
+ imul r5d, 0x55555555
+ movu m1, [tlq+16*1]
+ shr r5d, 30 ; filter_strength
+ movd m2, [tlq+r3]
+ lea tlq, [rsp+16*4]
+ sub r5, 3
+ mova [tlq-16*1], m0
+ pxor m7, m7
+ mova [tlq+16*0], m1
+ pshufb m3, m7
+ pshufb m2, m7
+ mova [tlq-16*2], m3
+ movq [tlq+r3-15], m2
+ call .filter_edge
+ sar r5d, 1
+ add r5d, 17
+ cmp hd, 8
+ cmova r3d, r5d
+.w8_main:
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq]
+ shl r3d, 6
+ movu m3, [base+z_filter_s+2]
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd
+ pshufb m7, [base+pw_m256]
+ sub r5, r3
+ pshufb m4, [base+pw_256]
+ psubw m4, [base+z_base_inc]
+ mova m6, m5
+.w8_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m0, [tlq+r3]
+ pand m1, m8, m5
+ psubw m2, m9, m1
+ psllw m1, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [dstq], m0
+ dec hd
+ jz .w8_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w8_loop
+ packuswb m7, m7
+.w8_end_loop:
+ movq [dstq], m7
+ add dstq, strideq
+ dec hd
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16:
+ lea r3d, [hq+15]
+ movd m0, r3d
+ and r3d, 15
+ or r3d, 16 ; imin(h+15, 31)
+ test angled, 0x400
+ jnz .w16_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m1, m0, [base+z_filter_wh16]
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w16_main ; filter_strength == 0
+ movd m4, [tlq-1]
+ movu m0, [tlq+16*0]
+ imul r5d, 0x24924924
+ movu m1, [tlq+16*1]
+ shr r5d, 30
+ movd m2, [tlq+30]
+ adc r5, -4 ; filter_strength-3
+ movd m3, [tlq+r3]
+ lea tlq, [rsp+16*4]
+ mova [tlq-16*1], m0
+ pxor m7, m7
+ mova [tlq+16*0], m1
+ pshufb m4, m7
+ movd [rsp], m2
+ pshufb m3, m7
+ mova [tlq-16*2], m4
+ movd [tlq+r3-16], m3
+ call .filter_edge
+ cmp hd, 16
+ jle .w16_main
+ pshuflw m0, [rsp], q0000
+ sar r5, 1
+ movd m1, [base+z_filter_k_tail+4+r5*4]
+ lea r3d, [r5+33]
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq+32], m0
+.w16_main:
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq]
+ movd m4, r3d
+ shl r3d, 6
+ pshufb m5, [base+pw_256]
+ pxor m6, m6
+ pshufb m7, m6
+ mov r5d, dxd
+ pshufb m4, m6
+ sub r5, r3
+ psubb m4, [base+pb_0to15]
+ mova m6, m5
+.w16_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m3, m5, 6
+ packsswb m3, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ paddw m5, m6
+ pcmpgtb m2, m4, m3
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq], m0
+ dec hd
+ jz .w16_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w16_loop
+.w16_end_loop:
+ mova [dstq], m7
+ add dstq, strideq
+ dec hd
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32:
+ lea r3d, [hq+31]
+ and r3d, 31
+ or r3d, 32 ; imin(h+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ movd m6, [tlq-1]
+ movu m0, [tlq+16*0]
+ movu m1, [tlq+16*1]
+ movu m2, [tlq+16*2]
+ movu m3, [tlq+16*3]
+ movd m4, [tlq+62]
+ movd m5, [tlq+r3]
+ lea tlq, [rsp+16*6]
+ mova [tlq-16*3], m0
+ pxor m7, m7
+ mova [tlq-16*2], m1
+ pshufb m6, m7
+ mova [tlq-16*1], m2
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq+16*0], m3
+ movd [rsp], m4
+ pshufb m5, m7
+ mova [tlq-16*4], m6
+ movd [tlq+r3-48], m5
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+ cmp hd, 32
+ jle .w32_main
+ pshuflw m0, [rsp], q0000
+ movd m1, [base+z_filter_k_tail+4]
+ add r3d, 2
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq+64], m0
+.w32_main:
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd
+ pshufb m0, m6
+ pshufb m5, [base+pw_256]
+ sub r5, r3
+ pshufb m7, m6
+ psubb m0, [base+pb_0to15]
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0
+ paddb m0, m1
+ mova [rsp+16*1], m0
+ mova m6, m5
+.w32_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*1], m0
+ dec hd
+ jz .w32_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w32_loop
+.w32_end_loop:
+ mova [dstq+16*0], m7
+ mova [dstq+16*1], m7
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64:
+ lea r3d, [hq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ movd m4, [tlq-1]
+ movu m0, [tlq+16*0]
+ movu m1, [tlq+16*1]
+ movu m2, [tlq+16*2]
+ movu m3, [tlq+16*3]
+ mova [rsp+16*3], m0
+ pxor m7, m7
+ mova [rsp+16*4], m1
+ pshufb m4, m7
+ mova [rsp+16*5], m2
+ mova [rsp+16*6], m3
+ mova [rsp+16*2], m4
+ movu m0, [tlq+16*4]
+ movu m1, [tlq+16*5]
+ movu m2, [tlq+16*6]
+ movu m3, [tlq+16*7]
+ movd m4, [tlq+r3]
+ lea tlq, [rsp+16*10]
+ mova [tlq-16*3], m0
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*2], m1
+ pshufb m4, m7
+ mova [tlq-16*1], m2
+ mova [tlq+16*0], m3
+ movd [tlq+r3-16*7], m4
+ cmp hd, 64
+ jl .w64_filter96 ; skip one call if the last 32 bytes aren't used
+ call .filter_edge
+.w64_filter96:
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+.w64_main:
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd
+ pshufb m0, m6
+ sub r5, r3
+ pshufb m5, [base+pw_256]
+ pshufb m7, m6
+ psubb m0, [base+pb_0to15]
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0
+ paddb m0, m1
+ mova [rsp+16*1], m0
+ paddb m0, m1
+ mova [rsp+16*2], m0
+ paddb m0, m1
+ mova [rsp+16*3], m0
+ mova m6, m5
+.w64_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*2+0]
+ movu m2, [tlq+r3+16*2+1]
+ mova [dstq+16*1], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*2], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*3+0]
+ movu m2, [tlq+r3+16*3+1]
+ mova [dstq+16*2], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*3], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*3], m0
+ dec hd
+ jz .w64_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w64_loop
+.w64_end_loop:
+ mova [dstq+16*0], m7
+ mova [dstq+16*1], m7
+ mova [dstq+16*2], m7
+ mova [dstq+16*3], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+ALIGN function_align
+.filter_edge: ; 32 pixels/iteration
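+; intra edge smoothing, per pixel roughly:
+;   out[i] = (k[0]*p[i-2] + k[1]*p[i-1] + k[2]*p[i] + k[1]*p[i+1] + k[0]*p[i+2] + 8) >> 4
+; with (k[0], k[1], k[2]) selected by filter_strength from (0,4,8), (0,5,6) or
+; (2,4,4); the z_filter_k taps are stored pre-scaled so that the final pmulhrsw
+; against pw_512 provides the equivalent rounding.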
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ movu m2, [tlq-18]
+ movu m1, [tlq-17]
+ movu m3, [tlq- 2]
+ movu m4, [tlq- 1]
+ punpcklbw m0, m2, m1
+ pmaddubsw m0, m7
+ punpckhbw m2, m1
+ pmaddubsw m2, m7
+ punpcklbw m1, m3, m4
+ pmaddubsw m1, m7
+ punpckhbw m3, m4
+ pmaddubsw m3, m7
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ mova m5, [tlq-16]
+ movu m6, [tlq-15]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ mova m5, [tlq+ 0]
+ movu m6, [tlq+ 1]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m1, m4
+ paddw m3, m5
+ test r5d, r5d
+ jnz .filter_end ; 3-tap
+ movddup m7, [base+z_filter_k+8*8]
+ movu m5, [tlq-14]
+ movu m6, [tlq+ 2]
+ punpcklbw m4, m5, m5
+ pmaddubsw m4, m7
+ punpckhbw m5, m5
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ punpcklbw m5, m6, m6
+ pmaddubsw m5, m7
+ punpckhbw m6, m6
+ pmaddubsw m6, m7
+ paddw m1, m5
+ paddw m3, m6
+.filter_end:
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m10}, m0, m2, m1, m3
+%else
+ mova m4, m10
+ REPX {pmulhrsw x, m4 }, m0, m2, m1, m3
+%endif
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [tlq+16*0], m0
+ mova [tlq+16*1], m1
+ ret
+
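+; Directional prediction, zone 2 (90..180 degrees, predicting from both edges):
+; each output pixel is interpolated from the top edge (base_x, frac_x, stepped
+; by dx) while its projected x position stays right of the top-left corner, and
+; from the left edge (base_y, frac_y, stepped by dy) once base_x drops past the
+; corner; the base_x/base_y compares in the loops below select per pixel.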
+%if ARCH_X86_64
+cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
+ %define base r7-$$
+ %define maxwm r6m
+ %define maxhm r7m
+ lea r7, [$$]
+ mov hd, hm
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ lea r9d, [wq-4]
+ mova m10, [base+pw_512]
+ shl r9d, 6
+ mova m11, [base+z1_shuf_w4]
+ or r9d, hd
+ mova m12, [base+z2_h_shuf]
+%else
+cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define m11 [rsp+16*16]
+ %define m12 [rsp+16*17]
+ %define r8 [rsp+16*6+4*1]
+ %define r9b byte [rsp+16*18+4*0]
+ %define r9d dword [rsp+16*18+4*0]
+ %define r10d dword [rsp+16*18+4*1]
+ %define r11d dword [rsp+16*18+4*2]
+ %define maxwm [rsp+16*18+4*3]
+ %define maxhm [rsp+16*19+4*0]
+ %define stridemp [rsp+16*19+4*1]
+ %define strideq r3
+ %define dyd r4
+ %define dyq r4
+ mov stridemp, r1
+ mov r1d, r6m
+ mov r4d, r7m
+ mov maxwm, r1d
+ mov maxhm, r4d
+ LEA r1, $$
+ lea hd, [wq-4]
+ mova m0, [base+z1_shuf_w4]
+ shl hd, 6
+ mova m1, [base+z2_h_shuf]
+ or hd, hm
+ mova m11, m0
+ mov r9d, hd
+ mova m12, m1
+%endif
+ tzcnt wd, wd
+ movifnidn angled, anglem
+ movsxd wq, [base+ipred_z2_ssse3_table+wq*4]
+%if ARCH_X86_64
+ movzx dxd, angleb
+%else
+ movzx dxd, byte anglem
+%endif
+ xor angled, 0x400
+ mova m0, [tlq-16*4]
+ mov dyd, dxd
+ mova m1, [tlq-16*3]
+ neg dxq
+ mova m2, [tlq-16*2]
+ and dyd, ~1
+ mova m3, [tlq-16*1]
+ and dxq, ~1
+ movd m4, [tlq]
+ movu m5, [tlq+16*0+1]
+ movu m6, [tlq+16*1+1]
+ movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
+ movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
+ mova [rsp+16*2], m0
+ pxor m7, m7
+ mova [rsp+16*3], m1
+ pshufb m4, m7
+ mova [rsp+16*4], m2
+ lea wq, [base+ipred_z2_ssse3_table+wq]
+ mova [rsp+16*5], m3
+ neg dxd
+ mova [rsp+16*6], m4
+ or dyd, 4<<16
+ mova [rsp+16*7], m4
+ mova [rsp+16*8], m5
+ mova [rsp+16*9], m6
+ movq m0, [base+z_base_inc+2]
+ movsldup m1, [base+z2_dy_offset]
+ movq m2, [base+pw_256] ; 4<<6
+ movq [rsp+16*14+8*0], m0
+ movq [rsp+16*15+8*0], m1
+ movq [rsp+16*15+8*1], m2
+%if ARCH_X86_64
+ lea r10d, [dxq+(128<<6)] ; xpos
+%else
+ mov [rsp+16*7+4*1], dyd
+ lea r4d, [dxq+(128<<6)]
+ mov r10d, r4d
+ movzx hd, r9b
+%endif
+ mov r11d, (128-4)<<6
+ jmp wq
+.w4:
+ test angled, 0x400
+ jnz .w4_main
+ movd m5, [tlq+4]
+ lea r3d, [hq+2]
+ add angled, 1022
+ pshufb m5, m7
+ shl r3d, 6
+ movd [rsp+16*8+4], m5
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ call .upsample_above
+ sub angled, 1075 ; angle - 53
+ lea r3d, [hq+3]
+ xor angled, 0x7f ; 180 - angle
+ movd m0, r3d
+ movd m6, angled
+ shr angled, 8 ; is_sm << 1
+ pshufb m0, m7
+ pshufb m6, m7
+ pcmpeqb m0, [base+z_filter_wh4]
+ pand m6, m0
+ pcmpgtb m6, [base+z_filter_t_w48+angleq*8]
+ jmp .w8_filter_left
+.upsample_above: ; w4/w8
+ movq m3, [rsp+gprsize+16*8-2]
+ movq m1, [rsp+gprsize+16*8-1]
+ movq m0, [rsp+gprsize+16*8+0]
+ movq m4, [rsp+gprsize+16*8+1]
+ movddup m5, [base+pb_36_m4]
+ punpcklbw m1, m3
+ punpcklbw m2, m0, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+%if ARCH_X86_64
+ mova m11, [base+pb_0to15]
+ lea r10d, [r10+dxq+(1<<6)]
+ mov r11d, (128-7)<<6
+%else
+ mova m3, [base+pb_0to15]
+ mov r3d, [rsp+gprsize+16*18+4*1]
+ mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6
+ lea r3d, [r3+dxq+(1<<6)]
+ mov [rsp+gprsize+16*18+4*1], r3d
+ mova [rsp+gprsize+16*16], m3
+%endif
+ add dxd, dxd
+ paddw m1, m2
+ pmulhrsw m1, m10
+ movq m2, [rsp+gprsize+16*14]
+ paddw m2, m2
+ movq [rsp+gprsize+16*14], m2
+ packuswb m1, m1
+ punpcklbw m1, m0
+ mova [rsp+gprsize+16*8], m1
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ mov [rsp], angled
+ sub angled, 1112 ; angle - 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movu m3, [base+z_filter_wh4]
+ mova m4, [base+z_filter_t_w48+angleq*8]
+ call .w8_filter_top
+ mov angled, [rsp]
+ lea r3d, [hq+2]
+ sub angled, 139
+ shl r3d, 6
+ test r3d, angled
+ jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+.upsample_left: ; w4/w8
+ neg hq
+ movd m0, [tlq+hq]
+ pshufb m0, m7
+ movd [rsp+16*6+hq-4], m0
+ movq m3, [rsp+16*5+7]
+ movq m0, [rsp+16*5+8]
+ movq m2, [rsp+16*5+9]
+ movq m4, [rsp+16*5+10]
+ movddup m5, [base+pb_36_m4]
+ punpcklbw m1, m0, m3
+ punpcklbw m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ movshdup m3, [base+z2_dy_offset]
+%if ARCH_X86_64
+ mova m12, [base+z2_upsample]
+ add dyd, dyd
+%else
+ mova m4, [base+z2_upsample]
+ shl dword [rsp+16*7+4*1], 1
+ mova m12, m4
+%endif
+ paddw m1, m2
+ pmulhrsw m1, m10
+ movq [rsp+16*15], m3
+ packuswb m1, m1
+ punpcklbw m0, m1
+ mova [rsp+16*5], m0
+.w4_main:
+ movd m6, dxd
+%if ARCH_X86_64
+ movd m3, dyd
+%else
+ movd m3, [rsp+16*7+4*1]
+%endif
+ movddup m0, [rsp+16*14+8*0]
+ pshufb m6, [base+pw_256]
+ paddw m7, m6, m6
+ movq m5, [base+pw_m1to4]
+ pshuflw m4, m3, q0000
+ punpcklqdq m6, m7
+ pmullw m4, m5
+ pshuflw m3, m3, q1111
+ paddw m6, m0
+ pshuflw m0, m4, q3333
+ psubw m4, [rsp+16*15]
+ movq [rsp+16*6+8*1], m3
+ movq [rsp+8*1], m0 ; dy*4
+%if ARCH_X86_64
+ mov r8, dstq
+%endif
+.w4_loop0:
+%if ARCH_X86_32
+ mov r8, dstq
+%endif
+ mova [rsp+16*12], m6
+ mov r2d, r10d
+ movq [rsp+8*0], m4
+ pand m0, m4, m8
+ psraw m4, 6
+ psubw m1, m9, m0
+ psllw m0, 8
+ por m0, m1 ; 64-frac_y, frac_y
+ movq [rsp+8*3], m0
+ pabsw m4, m4
+ movq [rsp+8*2], m4
+ movzx hd, r9b
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movq m0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movhps m0, [rsp+r3]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movq m1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ movhps m1, [rsp+r3]
+ pand m2, m8, m6
+ paddsw m5, m6, m7
+ psubw m3, m9, m2
+ psllw m2, 8
+ pshufb m0, m11
+ por m2, m3
+ pmaddubsw m0, m2
+ pand m2, m8, m5
+ psubw m3, m9, m2
+ psllw m2, 8
+ pshufb m1, m11
+ por m2, m3
+ pmaddubsw m1, m2
+ cmp r3d, 127 ; topleft
+ jge .w4_toponly
+ movzx r3d, byte [rsp+8*2+0] ; base_y0
+ movq m3, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+2] ; base_y1
+ movhps m3, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+4] ; base_y2
+ movq m4, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+6] ; base_y3
+ movhps m4, [rsp+r3]
+ pshufb m3, m12
+ pshufb m4, m12
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ movddup m4, [rsp+8*3]
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ psraw m6, 15 ; base_x < topleft
+ pand m2, m6
+ pandn m6, m0
+ por m0, m2, m6
+ psraw m6, m5, 15
+ pand m3, m6
+ pandn m6, m1
+ por m1, m3, m6
+.w4_toponly:
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ movifnidn strideq, stridemp
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*0], m0
+ psrlq m0, 32
+ movd [dstq+strideq*1], m0
+ sub hd, 4
+ jz .w4_end
+ movq m4, [rsp+8*2]
+ movq m3, [rsp+16*6+8*1]
+ paddw m6, m5, m7 ; xpos += dx
+ psubw m4, m3
+ movq [rsp+8*2], m4
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, r11d
+ jge .w4_loop
+ movddup m5, [rsp+8*3]
+.w4_leftonly_loop:
+ movzx r3d, byte [rsp+8*2+0] ; base_y0
+ movq m1, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+2] ; base_y1
+ movhps m1, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+4] ; base_y2
+ movq m2, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+6] ; base_y3
+ movhps m2, [rsp+r3]
+ psubw m4, m3
+ pshufb m1, m12
+ pshufb m2, m12
+ movq [rsp+8*2], m4
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ movifnidn strideq, stridemp
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*0], m0
+ psrlq m0, 32
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ sub r9d, 1<<8
+ jl .w4_ret
+ movq m4, [rsp+8*1]
+%if ARCH_X86_64
+ add r8, 4
+ mov dstq, r8
+%else
+ mov dstq, r8
+ add dstq, 4
+%endif
+ paddw m4, [rsp+8*0] ; base_y += 4*dy
+ movzx r3d, word [rsp+16*15+8*1]
+ add r10d, r3d
+ movddup m6, [rsp+16*15+8*1]
+ paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above)
+ jmp .w4_loop0
+.w4_ret:
+ RET
+.w8:
+ test angled, 0x400
+ jnz .w4_main
+ movd m5, [tlq+8]
+ lea r3d, [angleq+126]
+ pshufb m5, m7
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ movd [rsp+16*8+8], m5
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ call .upsample_above
+ sub angled, 53
+ lea r3d, [hq+7]
+ xor angled, 0x7f ; 180 - angle
+ movu m1, [base+z_filter_wh8]
+ movd m0, r3d
+ movd m6, angled
+ shr angled, 8 ; is_sm << 1
+ psrldq m2, [base+z_filter_t_w48+angleq*8], 4
+ pshufb m0, m7
+ pshufb m6, m7
+ pcmpeqb m0, m1
+ pand m6, m0
+ pcmpgtb m6, m2
+%if ARCH_X86_64
+ movq [rsp+16*15+8*1], m10 ; 8<<6
+%else
+ movq m0, m10
+ movq [rsp+16*15+8*1], m0
+%endif
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ mov [rsp], angled
+ sub angled, 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movu m3, [base+z_filter_wh8]
+ psrldq m4, [base+z_filter_t_w48+angleq*8], 4
+ call .w8_filter_top
+ mov r3d, [rsp]
+ sub r3d, 141
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ cmp r3d, 8
+ jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
+.w8_filter_left:
+ pmovmskb r5d, m6
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x55555555
+ mov r3, tlq
+ shr r5d, 30
+ sub r5, 3 ; filter_strength-3
+ jmp .filter_left
+.w8_filter_top:
+ movd m6, r3d
+ REPX {pshufb x, m7}, m0, m1, m6
+ pcmpeqb m0, m3
+ pand m1, m0
+ pand m6, m0
+ pcmpgtb m1, m4
+ pcmpgtb m6, m4
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_filter_top_end ; filter_strength == 0
+ imul r5d, 0x55555555
+ movq m0, [rsp+gprsize+16*8-2]
+ shr r5d, 30
+ movq m1, [rsp+gprsize+16*8-1]
+ sub r5, 3 ; filter_strength-3
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ punpcklbw m0, m1
+ pmaddubsw m0, m7
+ movq m1, [rsp+gprsize+16*8+0]
+ movq m2, [rsp+gprsize+16*8+1]
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ punpcklbw m1, m2
+ pmaddubsw m1, m7
+ movq m2, [rsp+gprsize+16*8+2]
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*2]
+ punpcklbw m2, m2
+ pmaddubsw m2, m7
+ paddw m0, m1
+ paddw m0, m2
+%if ARCH_X86_64
+ mov r3d, r7m ; maxw, offset due to call
+%else
+ mov r3d, [rsp+gprsize+16*18+4*3]
+%endif
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movq [rsp+gprsize+16*8], m0
+ cmp r3d, 8
+ jge .w8_filter_top_end
+ movq m0, [tlq+r3+1]
+ movq [rsp+gprsize+r3+16*8], m0
+.w8_filter_top_end:
+ ret
+.w16:
+ test angled, 0x400
+ jnz .w4_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movd m6, r3d
+ REPX {pshufb x, m7}, m0, m1, m6
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m0, [base+z_filter_wh16]
+ pand m1, m0
+ pand m6, m0
+ pcmpgtb m1, m3
+ pcmpgtb m6, m3
+ pmovmskb r5d, m1
+ mov r3, tlq
+ test r5d, r5d
+ jz .w16_filter_left ; filter_strength == 0
+ imul r5d, 0x24924924
+ pshufb m5, [base+z_filter_t_w16] ; tlq[16]
+ shr r5d, 30
+ adc r5, -4 ; filter_strength-3
+ movd [rsp+16*9], m5
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ movu m1, [rsp+16*8-2]
+ movu m2, [rsp+16*8-1]
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m7
+ punpckhbw m1, m2
+ pmaddubsw m1, m7
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ mova m3, [rsp+16*8+0]
+ movu m4, [rsp+16*8+1]
+ punpcklbw m2, m3, m4
+ pmaddubsw m2, m7
+ punpckhbw m3, m4
+ pmaddubsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ test r5d, r5d
+ jnz .w16_filter_end ; 3-tap
+ movddup m7, [base+z_filter_k+8*8]
+ movu m3, [rsp+16*8+2]
+ punpcklbw m2, m3, m3
+ pmaddubsw m2, m7
+ punpckhbw m3, m3
+ pmaddubsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+.w16_filter_end:
+ mov r2d, maxwm
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*8], m0
+ cmp r2d, 16
+ jge .w16_filter_left
+ movu m0, [r3+r2+1]
+ movu [rsp+r2+16*8], m0
+.w16_filter_left:
+ pmovmskb r5d, m6
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x24924924
+ shr r5d, 30
+ adc r5, -4 ; filter_strength-3
+ jmp .filter_left
+.w32:
+ test angled, 0x400
+ jnz .w4_main
+ pshufb m6, [base+z_filter_t_w16] ; tlq[32]
+ mov r3, tlq
+ lea tlq, [rsp+16*9]
+ movd [tlq+16*1], m6
+ xor r5d, r5d ; filter_strength = 3
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mov r2d, maxwm
+ mova [rsp+16*8], m0
+ mova [rsp+16*9], m1
+ cmp r2d, 32
+ jge .filter_left
+ movu m0, [r3+r2+16*0+1]
+ movu m1, [r3+r2+16*1+1]
+ movu [rsp+r2+16*8], m0
+ movu [rsp+r2+16*9], m1
+ jmp .filter_left
+.w64:
+ movu m0, [tlq+16*2+1]
+ movu m1, [tlq+16*3+1]
+ mova [rsp+16*10], m0
+ mova [rsp+16*11], m1
+ test angled, 0x400
+ jnz .w4_main
+ pshufb m1, [base+z_filter_t_w16] ; tlq[64]
+ mov r3, tlq
+ lea tlq, [rsp+16*11]
+ movd [tlq+16*1], m1
+ xor r5d, r5d ; filter_strength = 3
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mova m2, [tlq+16*2]
+ mova m3, [tlq+16*3]
+ mov r2d, maxwm
+ mova [rsp+16* 8], m0
+ mova [rsp+16* 9], m1
+ mova [rsp+16*10], m2
+ mova [rsp+16*11], m3
+ cmp r2d, 64
+ jge .filter_left
+ movu m0, [r3+r2+16*0+1]
+ movu m1, [r3+r2+16*1+1]
+ movu [rsp+r2+16* 8], m0
+ movu [rsp+r2+16* 9], m1
+ cmp r2d, 32
+ jge .filter_left
+ movu m0, [r3+r2+16*2+1]
+ movu m1, [r3+r2+16*3+1]
+ movu [rsp+r2+16*10], m0
+ movu [rsp+r2+16*11], m1
+.filter_left:
+ neg hq
+ movd m0, [r3+hq]
+ pxor m1, m1
+ pshufb m0, m1
+ movd [rsp+16*6+hq-4], m0
+ lea tlq, [rsp+16*5]
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ cmp hd, -32
+ jge .filter_left_end
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mova [rsp+16*2], m0
+ mova [rsp+16*3], m1
+.filter_left_end:
+ mov r2d, maxhm
+ mova m0, [rsp+16*5]
+ mova m1, [rsp+16*6]
+ mova m2, [rsp+16*7]
+ neg r2
+ mova [rsp+16*4], m0
+ mova [rsp+16*5], m1
+ mova [rsp+16*6], m2
+ cmp r2d, hd
+ jle .w4_main
+ movu m0, [r3+r2-16*2]
+ movu m1, [r3+r2-16*1]
+ movu [rsp+r2+16*4], m0
+ movu [rsp+r2+16*5], m1
+ cmp r2d, -32
+ jle .w4_main
+ movu m0, [r3+r2-16*4]
+ movu m1, [r3+r2-16*3]
+ movu [rsp+r2+16*2], m0
+ movu [rsp+r2+16*3], m1
+ jmp .w4_main
+
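+; Directional prediction, zone 3 (predicting from the left edge). Same
+; interpolation as zone 1 but walking down the left column with dy, roughly:
+;   ypos = (x + 1) * dy, frac = ypos & 0x3e, base = (ypos >> 6) + y * base_inc
+;   dst[y][x] = (left[-base] * (64 - frac) + left[-(base + 1)] * frac + 32) >> 6
+; (clamped to left[-max_base_y] past the edge). Columns are written to a scratch
+; buffer on the stack and then transposed into dst by the transpose code at the end.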
+%if ARCH_X86_64
+cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
+ %define base r7-$$
+ lea r7, [$$]
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ mova m10, [base+pw_512]
+ mov org_wd, wd
+%else
+cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define org_wd r5
+ %define org_wq r5
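+; x86-32: stride and the original width are stashed in the first two rows of
+; dst (overwritten by the prediction output later) so their registers can be
+; reused; the transpose code reloads them from there.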
+ mov [dstq+strideq*0], strideq
+ mov [dstq+strideq*1], wd
+ LEA r1, $$
+%endif
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ dec tlq
+ movsxd hq, [base+ipred_z3_ssse3_table+hq*4]
+ sub angled, 180
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ lea hq, [base+ipred_z3_ssse3_table+hq]
+ movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
+ jmp hq
+.h4:
+ lea r4d, [angleq+88]
+ test r4d, 0x480
+ jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r4d, 9
+ add r4d, wd
+ cmp r4d, 8
+ jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
+ movu m3, [tlq-7]
+ movu m1, [base+z_upsample1-4]
+ movu m4, [base+z_filter_s+2]
+ pshufb m0, m3, m1
+ pxor m1, m1
+ pshufb m2, m3, m1
+ pshufb m1, m3, m4
+ mova [rsp+16], m2 ; top[max_base_y]
+ movddup m2, [base+pb_36_m4]
+ add dyd, dyd
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ movd m5, dyd
+ mov r5d, dyd
+ pshufb m5, [base+pw_256]
+ paddw m0, m1
+ pmulhrsw m0, m10
+ shl wd, 2
+ mov tlq, rsp
+ sub rsp, wq
+ packuswb m0, m0
+ punpcklbw m0, m3
+ paddw m6, m5, m5
+ punpcklqdq m5, m6
+ pshufb m0, [base+pb_15to0]
+ mova [tlq], m0
+.h4_upsample_loop:
+ lea r4d, [r5+dyq]
+ shr r5d, 6
+ movq m0, [tlq+r5]
+ lea r5d, [r4+dyq]
+ shr r4d, 6
+ movhps m0, [tlq+r4]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ por m1, m2
+ pmaddubsw m0, m1
+ paddw m5, m6
+ pmulhrsw m0, m10
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jg .h4_upsample_loop
+ jmp .h4_transpose
+.h4_no_upsample:
+ mov r4d, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea r4d, [wq+3]
+ movd m0, r4d
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ pcmpeqb m1, m0, [base+z_filter_wh4]
+ pand m1, m2
+ pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r4d, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ movu m2, [tlq-7]
+ imul r5d, 0x55555555
+ movu m3, [base+z_filter_s-2]
+ shr r5d, 30 ; filter_strength
+ mova m4, [base+z_upsample2]
+ movddup m5, [base+z_filter_k-8+r5*8+24*0]
+ movddup m6, [base+z_filter_k-8+r5*8+24*1]
+ movddup m7, [base+z_filter_k-8+r5*8+24*2]
+ pshufb m0, m2, m3
+ shufps m3, m4, q2121
+ pmaddubsw m1, m0, m5
+ pmaddubsw m0, m6
+ pshufb m5, m2, m3
+ pmaddubsw m3, m5, m6
+ pmaddubsw m5, m7
+ pshufb m2, m4
+ pmaddubsw m2, m7
+ paddw m0, m1
+ paddw m1, m3
+ paddw m0, m5
+ paddw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ lea r2d, [r4+2]
+ cmp wd, 4
+ cmovne r4d, r2d
+ pshufd m0, m0, q0000
+ lea tlq, [rsp+15]
+ packuswb m0, m1
+ mova [rsp], m0
+.h4_main:
+ movd m5, dyd
+ movddup m0, [base+z_base_inc] ; base_inc << 6
+ sub tlq, r4
+ shl r4d, 6
+ movd m7, [tlq]
+ movd m4, r4d
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, [base+pw_m256]
+ mova m3, [base+z3_shuf_h4]
+ lea r5, [dyq+r4+63] ; ypos
+ pshufb m4, [base+pw_256]
+ psubw m4, m0 ; max_base_y
+ shl wd, 2
+ paddw m6, m5, m5
+ sub rsp, wq
+ punpcklqdq m5, m6
+.h4_loop:
+ lea r4, [r5+dyq]
+ sar r5, 6
+ movq m0, [tlq+r5-4]
+ lea r5, [r4+dyq]
+ sar r4, 6
+ movhps m0, [tlq+r4-4]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jz .h4_transpose
+ test r5d, r5d
+ jg .h4_loop
+ packuswb m7, m7
+.h4_end_loop:
+ movq [rsp+wq-8], m7
+ sub wd, 8
+ jg .h4_end_loop
+.h4_transpose:
+ mova m1, [base+z_transpose4]
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ lea r2, [strideq*3]
+ lea dstq, [dstq+org_wq-4]
+.h4_transpose_loop:
+ mova m0, [rsp]
+ add rsp, 16
+ pshufb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m2, m0, q1032
+ movd [dstq+strideq*1], m2
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r2 ], m0
+ sub dstq, 4
+ sub org_wd, 4
+ jg .h4_transpose_loop
+ RET
+.h8:
+ lea r4d, [angleq+88]
+ and r4d, ~0x7f
+ or r4d, wd
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ mova m4, [tlq-15]
+ and r4d, 4
+ movu m3, [tlq- 9]
+ movd m1, r4d
+ movu m2, [base+z_filter_s+2]
+ pxor m0, m0
+ movu m5, [base+z_filter_s+6]
+ movddup m7, [base+pb_36_m4]
+ pshufb m1, m0 ; w & 4
+ movu m0, [base+z_upsample1-4]
+ pmaxub m1, m0 ; clip 4x8
+ add dyd, dyd
+ pshufb m0, m4, m1
+ pmaddubsw m0, m7
+ pshufb m1, m4, m2
+ pmaddubsw m1, m7
+ pshufb m2, m3, [base+z_upsample1]
+ pmaddubsw m2, m7
+ pshufb m3, m5
+ pmaddubsw m3, m7
+ movd m5, dyd
+ neg dyq
+ paddw m1, m0
+ paddw m2, m3
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ shl wd, 3
+ lea tlq, [rsp+16]
+ pshufb m5, [base+pw_256]
+ sub rsp, wq
+ packuswb m1, m2
+ lea r5, [dyq+63]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ mova [tlq-16*1], m0
+ mova [tlq-16*0], m1
+ paddw m6, m5, m5
+ punpcklqdq m5, m6
+.h8_upsample_loop:
+ lea r4, [r5+dyq]
+ sar r5, 6
+ movu m0, [tlq+r5]
+ lea r5, [r4+dyq]
+ sar r4, 6
+ movu m1, [tlq+r4]
+ pand m3, m8, m5
+ psubw m2, m9, m3
+ psllw m2, 8
+ por m3, m2
+ pshufd m2, m3, q1010
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m1, m0
+ mova [rsp+wq-16], m1
+ sub wd, 16
+ jg .h8_upsample_loop
+ jmp .h8_transpose
+.h8_no_upsample:
+ lea r4d, [wq+7]
+ movd m0, r4d
+ and r4d, 7
+ or r4d, 8 ; imin(w+7, 15)
+ test angled, 0x400
+ jnz .h8_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movu m1, [base+z_filter_wh8]
+ psrldq m3, [base+z_filter_t_w48+angleq*8], 4
+ pcmpeqb m1, m0
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .h8_main ; filter_strength == 0
+ mova m0, [tlq-15]
+ imul r5d, 0x55555555
+ movd m1, [tlq+1]
+ neg r4
+ movd m2, [tlq+r4]
+ shr r5d, 30
+ pxor m7, m7
+ lea tlq, [rsp+16*2]
+ sub r5, 3 ; filter_strength-3
+ mova [tlq+16*0], m0
+ pshufb m1, m7
+ mova [tlq+16*1], m1
+ pshufb m2, m7
+ movq [tlq+r4+8], m2
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sar r5d, 1
+ add tlq, 31
+ add r5d, 17
+ cmp wd, 8
+ cmova r4d, r5d
+.h8_main:
+ movd m5, dyd
+ sub tlq, r4
+ shl r4d, 6
+ movd m7, [tlq]
+ movd m4, r4d
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, [base+pw_m256]
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, [base+pw_256]
+ psubw m4, [base+z3_base_inc]
+ shl wd, 3
+ mova m6, m5
+ sub rsp, wq
+.h8_loop:
+ mov r4, r5
+ sar r4, 6
+ movu m0, [tlq+r4-8]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jz .h8_transpose
+ add r5, dyq
+ jg .h8_loop
+ packuswb m7, m7
+.h8_end_loop:
+ movq [rsp+wq-8], m7
+ sub wd, 8
+ jg .h8_end_loop
+.h8_transpose:
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ or r3d, 8
+ cmp org_wd, 4
+%if ARCH_X86_64
+ jne .end_transpose_main
+%else
+ jne .end_transpose_loop
+%endif
+ mova m1, [rsp+16*1]
+ mova m0, [rsp+16*0]
+ lea r2, [strideq*3]
+ add rsp, 16*2
+ punpcklbw m2, m1, m0
+ punpckhbw m1, m0
+ punpckhbw m0, m1, m2
+ punpcklbw m1, m2
+.write_4x8_end:
+ call .write_4x8
+ RET
+.write_4x8:
+ movd [dstq+r2 ], m0
+ pshuflw m4, m0, q1032
+ movd [dstq+strideq*2], m4
+ punpckhqdq m0, m0
+ movd [dstq+strideq*1], m0
+ psrlq m0, 32
+ movd [dstq+strideq*0], m0
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+r2 ], m1
+ pshuflw m4, m1, q1032
+ movd [dstq+strideq*2], m4
+ punpckhqdq m1, m1
+ movd [dstq+strideq*1], m1
+ psrlq m1, 32
+ movd [dstq+strideq*0], m1
+ ret
+.h16:
+ lea r4d, [wq+15]
+ movd m0, r4d
+ and r4d, 15
+ or r4d, 16 ; imin(w+15, 31)
+ test angled, 0x400
+ jnz .h16_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m1, m0, [base+z_filter_wh16]
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ mova m0, [tlq-16*2+1]
+ imul r5d, 0x24924924
+ mova m1, [tlq-16*1+1]
+ neg r4
+ movd m2, [tlq-16*0+1]
+ shr r5d, 30
+ movd m3, [tlq+r4]
+ adc r5, -4 ; filter_strength-3
+ pxor m7, m7
+ lea tlq, [rsp+16*2]
+ mova [tlq-16*1], m0
+ pshufb m2, m7
+ mova [tlq+16*0], m1
+ pshufb m3, m7
+ mova [tlq+16*1], m2
+ movq [tlq+r4+8], m3
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ add tlq, 31
+ cmp wd, 16
+ jle .h16_main
+ pshuflw m0, [tlq-47], q0000
+ sar r5, 1
+ movq m1, [base+z3_filter_k_tail+r5*4]
+ lea r4d, [r5+33]
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq-35], m0
+.h16_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ shl wd, 4
+ mova m6, m5
+ sub rsp, wq
+.h16_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*2]
+ por m2, m1
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ psrlw m2, m5, 6
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+wq-16], m0
+ sub wd, 16
+ jz .h16_transpose
+ add r5, dyq
+ jg .h16_loop
+.h16_end_loop:
+ mova [rsp+wq-16], m7
+ sub wd, 16
+ jg .h16_end_loop
+.h16_transpose:
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ or r3d, 16
+ cmp org_wd, 4
+%if ARCH_X86_64
+ jne .end_transpose_main
+%else
+ jne .end_transpose_loop
+%endif
+.h16_transpose_w4:
+ mova m2, [rsp+16*3]
+ mova m4, [rsp+16*2]
+ mova m3, [rsp+16*1]
+ mova m0, [rsp+16*0]
+ lea r2, [strideq*3]
+ add rsp, 16*4
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ punpckhbw m4, m3, m0
+ punpcklbw m3, m0
+ punpckhwd m0, m1, m4
+ punpcklwd m1, m4
+ call .write_4x8
+ lea dstq, [dstq+strideq*4]
+ punpckhwd m0, m2, m3
+ punpcklwd m1, m2, m3
+ jmp .write_4x8_end
+.h32:
+ lea r4d, [wq+31]
+ and r4d, 31
+ or r4d, 32 ; imin(w+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h32_main
+ mova m0, [tlq-16*4+1]
+ mova m1, [tlq-16*3+1]
+ mova m2, [tlq-16*2+1]
+ mova m3, [tlq-16*1+1]
+ movd m4, [tlq-16*0+1]
+ neg r4
+ movd m5, [tlq+r4]
+ pxor m7, m7
+ lea tlq, [rsp+16*4]
+ mova [tlq-16*3], m0
+ mova [tlq-16*2], m1
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*1], m2
+ pshufb m4, m7
+ mova [tlq+16*0], m3
+ pshufb m5, m7
+ mova [tlq+16*1], m4
+ movq [tlq+r4+8], m5
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ add tlq, 63
+ cmp wd, 32
+ jle .h32_main
+ pshuflw m0, [tlq-79], q0000
+ movq m1, [base+z3_filter_k_tail]
+ add r4d, 2
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq-67], m0
+.h32_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ mova m6, m5
+.h32_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*4]
+ por m2, m1
+ movu m1, [tlq+r4-8*3]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ sub rsp, 32
+ packuswb m0, m1
+ mova [rsp+16*0], m0
+ movu m0, [tlq+r4-8*2]
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ psrlw m2, m5, 6
+ paddw m5, m6
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ mova [rsp+16*1], m0
+ pand m0, m1, [rsp+16*0]
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+16*0], m0
+ dec wd
+ jz .h32_transpose
+ add r5, dyq
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 32
+ mova [rsp+16*1], m7
+ mova [rsp+16*0], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ or r3d, 32
+ jmp .end_transpose_main
+.h64:
+ lea r4d, [wq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h64_main
+ mova m0, [tlq-16*8+1]
+ mova m1, [tlq-16*7+1]
+ mova m2, [tlq-16*6+1]
+ mova m3, [tlq-16*5+1]
+ mova [rsp+16*1], m0
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m2
+ mova [rsp+16*4], m3
+ mova m0, [tlq-16*4+1]
+ mova m1, [tlq-16*3+1]
+ mova m2, [tlq-16*2+1]
+ mova m3, [tlq-16*1+1]
+ movd m4, [tlq-16*0+1]
+ neg r4
+ movd m5, [tlq+r4]
+ pxor m7, m7
+ lea tlq, [rsp+16*8]
+ mova [tlq-16*3], m0
+ mova [tlq-16*2], m1
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*1], m2
+ pshufb m4, m7
+ mova [tlq+16*0], m3
+ pshufb m5, m7
+ mova [tlq+16*1], m4
+ movq [tlq+r4+8], m5
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ cmp wd, 64
+ jl .h64_filter96 ; skip one call if the last 32 bytes aren't used
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+.h64_filter96:
+ add tlq, 127
+.h64_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ mova m6, m5
+.h64_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*8]
+ por m2, m1
+ movu m1, [tlq+r4-8*7]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ sub rsp, 64
+ packuswb m0, m1
+ mova [rsp+16*0], m0
+ movu m0, [tlq+r4-8*6]
+ movu m1, [tlq+r4-8*5]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*1], m0
+ movu m0, [tlq+r4-8*4]
+ movu m1, [tlq+r4-8*3]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*2], m0
+ movu m0, [tlq+r4-8*2]
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ psrlw m2, m5, 6
+ paddw m5, m6
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ mova [rsp+16*3], m0
+ pand m0, m1, [rsp+16*2]
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ mova [rsp+16*2], m0
+ pand m0, m1, [rsp+16*1]
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ mova [rsp+16*1], m0
+ pand m0, m1, [rsp+16*0]
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+16*0], m0
+ dec wd
+ jz .h64_transpose
+ add r5, dyq
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 64
+ mova [rsp+16*3], m7
+ mova [rsp+16*2], m7
+ mova [rsp+16*1], m7
+ mova [rsp+16*0], m7
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ or r3d, 64
+.end_transpose_main:
+%if ARCH_X86_64
+ lea r5, [r3*3]
+ lea r7, [strideq*3]
+%else
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+.end_transpose_loop:
+ lea r4, [rsp+r3-8]
+ lea r6, [dstq+org_wq-8]
+.end_transpose_loop_y:
+ movq m0, [r4+r3*1]
+ movq m4, [r4+r3*0]
+%if ARCH_X86_64
+ movq m1, [r4+r5 ]
+ movq m5, [r4+r3*2]
+ lea r2, [r4+r3*4]
+%else
+ lea r2, [r4+r3*2]
+ movq m1, [r2+r3*1]
+ movq m5, [r2+r3*0]
+ lea r2, [r2+r3*2]
+%endif
+ movq m2, [r2+r3*1]
+ movq m6, [r2+r3*0]
+%if ARCH_X86_64
+ movq m3, [r2+r5 ]
+ movq m7, [r2+r3*2]
+%else
+ lea r2, [r2+r3*2]
+ movq m3, [r2+r3*1]
+ movq m7, [r2+r3*0]
+%endif
+ sub r4, 8
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ punpcklbw m2, m6
+ punpcklbw m3, m7
+ punpckhwd m4, m1, m0
+ punpcklwd m1, m0
+ punpckhwd m0, m3, m2
+ punpcklwd m3, m2
+ punpckhdq m2, m3, m1
+ punpckldq m3, m1
+ punpckldq m1, m0, m4
+ punpckhdq m0, m4
+ movhps [r6+strideq*0], m0
+ movq [r6+strideq*1], m0
+%if ARCH_X86_64
+ movhps [r6+strideq*2], m1
+ movq [r6+r7 ], m1
+ lea r6, [r6+strideq*4]
+%else
+ lea r6, [r6+strideq*2]
+ movhps [r6+strideq*0], m1
+ movq [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+%endif
+ movhps [r6+strideq*0], m2
+ movq [r6+strideq*1], m2
+%if ARCH_X86_64
+ movhps [r6+strideq*2], m3
+ movq [r6+r7 ], m3
+ lea r6, [r6+strideq*4]
+%else
+ lea r6, [r6+strideq*2]
+ movhps [r6+strideq*0], m3
+ movq [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+%endif
+ cmp r4, rsp
+ jae .end_transpose_loop_y
+ lea rsp, [rsp+r3*8]
+ sub org_wd, 8
+ jg .end_transpose_loop
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
+; const uint8_t *idx, const int w, const int h);
+;---------------------------------------------------------------------------------------
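+; m4 holds the 8 palette entries, packed from 16-bit down to bytes; each pshufb
+; then maps 16 one-byte palette indices to pixels in a single step, i.e.
+; dst[i] = pal[idx[i]].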
+cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
+ mova m4, [palq]
+ LEA r2, pal_pred_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4]
+ packuswb m4, m4
+ add wq, r2
+ lea r2, [strideq*3]
+ jmp wq
+.w4:
+ pshufb m0, m4, [idxq]
+ add idxq, 16
+ movd [dstq ], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq ], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r2 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+ALIGN function_align
+.w8:
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ add idxq, 32
+ movq [dstq ], m0
+ movhps [dstq+strideq ], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r2 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+ALIGN function_align
+.w16:
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ pshufb m2, m4, [idxq+32]
+ pshufb m3, m4, [idxq+48]
+ add idxq, 64
+ mova [dstq ], m0
+ mova [dstq+strideq ], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r2 ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+ALIGN function_align
+.w32:
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ pshufb m2, m4, [idxq+32]
+ pshufb m3, m4, [idxq+48]
+ add idxq, 64
+ mova [dstq ], m0
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+ALIGN function_align
+.w64:
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ pshufb m2, m4, [idxq+32]
+ pshufb m3, m4, [idxq+48]
+ add idxq, 64
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, strideq
+ sub hd, 1
+ jg .w64
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
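+; Per-pixel reference for the IPRED_CFL macro below:
+;   diff = alpha * ac[x]
+;   dst[x] = clip_pixel(dc + sign(diff) * ((abs(diff) + 32) >> 6))
+; m0 = splatted dc, m1 = splatted alpha, m2 = abs(alpha) << 9 so that pmulhrsw
+; against m2 yields (abs(alpha * ac) + 32) >> 6 in one step; packuswb clips.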
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ psignw m3, m%1, m1
+ pabsw m%1, m%1
+ pmulhrsw m%1, m2
+ psignw m%1, m3
+ paddw m%1, m0
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ movifnidn wd, wm
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd m4, t0d
+ tzcnt t0d, t0d
+ movd m5, t0d
+ LEA t0, ipred_cfl_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+16]
+ pcmpeqd m3, m3
+ psrlw m4, 1
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h4:
+ movd m0, [tlq-4]
+ pmaddubsw m0, m3
+ jmp wq
+.w4:
+ movd m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m0, m4
+ paddw m0, m1
+ pmaddwd m0, m3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3 ; dc >>= ctz(width + height);
+ jmp .w4_end
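+; rectangular blocks: after the power-of-two shift a factor of 3 or 5 is left in
+; width + height, so the dc sum is scaled by 0x5556 ~= 0x10000/3 or
+; 0x3334 ~= 0x10000/5 via pmulhuw to complete the division.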
+.w4_mul:
+ punpckhqdq m1, m0, m0
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8
+ cmovz r6d, r2d
+ movd m5, r6d
+ pmulhuw m0, m5
+.w4_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s4:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movd [dstq+strideq*0], m4
+ pshuflw m4, m4, q1032
+ movd [dstq+strideq*1], m4
+ punpckhqdq m4, m4
+ movd [dstq+strideq*2], m4
+ psrlq m4, 32
+ movd [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ movq m0, [tlq-8]
+ pmaddubsw m0, m3
+ jmp wq
+.w8:
+ movq m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movq [dstq ], m4
+ movhps [dstq+strideq ], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movq [dstq+strideq*2], m4
+ movhps [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-16]
+ pmaddubsw m0, m3
+ jmp wq
+.w16:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ mova m2, [tlq-16]
+ pmaddubsw m2, m3
+ paddw m0, m2
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ movu m2, [tlq+17]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 64|16
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq+16], m4
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov t0d, 0x8000
+ movd m3, t0d
+ movd m2, r6d
+ psrld m3, m2
+ LEA t0, ipred_cfl_left_ssse3_table
+ movsxd r6, [t0+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
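+ ; m3 = 0x8000 >> log2(h) acts as a rounded reciprocal: the pmulhrsw in .h4
+ ; below is roughly dc = (sum * (32768 / h) + 16384) >> 15, i.e. sum / h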
+.h32:
+ movu m1, [tlq+16] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h16:
+ pshufd m1, m0, q3232 ; psrlq m1, m0, 16
+ paddw m0, m1
+.h8:
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m2
+ pmulhrsw m0, m3
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ LEA t0, ipred_cfl_left_ssse3_table
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ movd m3, r6d
+ movd m2, wd
+ psrld m3, m2
+ movsxd r6, [t0+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ tzcnt wd, wm
+ movifnidn hd, hm
+ LEA r6, ipred_cfl_splat_ssse3_table
+ movsxd wq, [r6+wq*4]
+ movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128]
+ add wq, r6
+ movifnidn acq, acmp
+ jmp wq
+
+%macro RELOAD_ACQ_32 1
+ mov acq, ac_bakq ; restore acq
+%endmacro
+
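+; cfl_ac_*: downsample the luma block and pre-scale it so every chroma
+; position holds 8x the average luma (4:2:0: 2x2 sum << 1 via pmaddubsw with
+; pb_2; 4:2:2: horizontal pair sum << 2 via pb_4; 4:4:4: pixel << 3), with
+; wpad/hpad replicating the last valid column/row; the block mean, roughly
+; (sum + sz/2) >> log2(sz), is then subtracted from every entry.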
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+DECLARE_REG_TMP 7
+ movddup m2, [pb_2]
+%else
+cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+DECLARE_REG_TMP 4
+%define ac_bakq acmp
+ mov t0d, 0x02020202
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m5, t0d
+ movifnidn hpadd, hpadm
+%if ARCH_X86_64
+ mov ac_bakq, acq
+%endif
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m4, m4
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq m0, [yq]
+ movq m1, [yq+strideq]
+ movhps m0, [yq+strideq*2]
+ movhps m1, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+ punpckhqdq m0, m0
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+ jmp .calc_avg_4_8
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ mova m0, [yq+strideq*2]
+ mova m1, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+ jmp .w8_hpad
+.w8_wpad: ; wpadd=1
+ movddup m0, [yq]
+ movddup m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufhw m0, m0, q3333
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 16
+ sub hd, 1
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+.w8_hpad:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 1
+ jg .w8_hpad
+ jmp .calc_avg_4_8
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ mova m6, [yq+16]
+ mova m1, [yq+strideq+16]
+ pmaddubsw m6, m2
+ pmaddubsw m1, m2
+ paddw m6, m1
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:
+ movddup m0, [yq]
+ movddup m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufhw m0, m0, q3333
+ mova [acq], m0
+ paddw m4, m0
+ mova m6, m0
+ punpckhqdq m6, m0, m0
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ pshufhw m6, m0, q3333
+ punpckhqdq m6, m6
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ movddup m6, [yq+16]
+ movddup m1, [yq+strideq+16]
+ pmaddubsw m6, m2
+ pmaddubsw m1, m2
+ paddw m6, m1
+ pshufhw m6, m6, q3333
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg16
+.w16_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ mova [acq+16], m6
+ paddw m4, m6
+ add acq, 32
+ dec hpadd
+ jg .w16_hpad_loop
+ jmp .calc_avg16
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+.calc_avg_4_8:
+ psrlw m2, 9
+ pmaddwd m4, m2
+ jmp .calc_avg
+.calc_avg16:
+ psrld m0, m4, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m4, m0
+.calc_avg:
+ movd szd, m5
+ psrad m5, 1
+ tzcnt r1d, szd
+ paddd m4, m5
+ movd m1, r1d
+ pshufd m0, m4, q2301
+ paddd m0, m4
+ pshufd m4, m0, q1032
+ paddd m0, m4
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0
+ RELOAD_ACQ_32 acq
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0 ; ac[x] -= sum;
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+ movddup m2, [pb_4]
+%else
+cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+ mov t0d, 0x04040404
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m6, t0d
+ movifnidn hpadd, hpadm
+%if ARCH_X86_64
+ mov ac_bakq, acq
+%endif
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m4, m4
+ pxor m5, m5
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq m1, [yq]
+ movhps m1, [yq+strideq]
+ movq m0, [yq+strideq*2]
+ movhps m0, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4
+ punpckhqdq m0, m0
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+ jmp .calc_avg_4
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m0
+ paddw m5, m1
+ mova m1, [yq+strideq*2]
+ mova m0, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w8_hpad
+.w8_wpad:
+ movddup m1, [yq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ movddup m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w8_hpad:
+ mova [acq], m0
+ paddw m4, m0
+ mova [acq+16], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad
+ jmp .calc_avg_8_16
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m1, [yq]
+ mova m0, [yq+16]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m0
+ paddw m5, m1
+ mova m1, [yq+strideq]
+ mova m0, [yq+strideq+16]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m0
+ paddw m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:
+ movddup m1, [yq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movddup m1, [yq+strideq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhqdq m0, m1, m1
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:
+ mova m1, [yq]
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ pshufhw m1, m1, q3333
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ mova m1, [yq+strideq]
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ mova m0, m1
+ pshufhw m0, m0, q3333
+ punpckhqdq m0, m0
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:
+ mova m1, [yq]
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ movddup m0, [yq+16]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m1, [yq+strideq]
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ movddup m0, [yq+strideq+16]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w16_hpad_loop:
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m1
+ paddw m5, m0
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m1
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ jmp .calc_avg_8_16
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+.calc_avg_4:
+ psrlw m2, 10
+ pmaddwd m5, m2
+ pmaddwd m0, m4, m2
+ jmp .calc_avg
+.calc_avg_8_16:
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, m4
+ psrld m0, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m0, m4
+.calc_avg:
+ paddd m5, m0
+ movd szd, m6
+ psrad m6, 1
+ tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height);
+ paddd m5, m6
+ movd m1, r1d
+ pshufd m0, m5, q2301
+ paddd m0, m5
+ pshufd m5, m0, q1032
+ paddd m0, m5
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0
+ RELOAD_ACQ_32 acq ; ac = ac_orig
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
+ movddup m2, [pb_4]
+%else
+cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
+%define ac_bakq [rsp+16*4]
+ mov t0d, 0x04040404
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ movifnidn hpadd, hpadm
+ movd m0, hpadd
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m6, t0d
+ movd hpadd, m0
+ mov ac_bakq, acq
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m5, m5
+ pxor m4, m4
+ cmp wd, 16
+ jg .w32
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movd m1, [yq]
+ movd m3, [yq+strideq]
+ punpckldq m1, m3
+ punpcklbw m1, m1
+ movd m0, [yq+strideq*2]
+ movd m3, [yq+stride3q]
+ punpckldq m0, m3
+ punpcklbw m0, m0
+ pmaddubsw m1, m2
+ pmaddubsw m0, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4
+ punpckhqdq m0, m0
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m5, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+.calc_avg_4:
+ psrlw m2, 10
+ pmaddwd m5, m2
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ movq m1, [yq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ movq m0, [yq+strideq]
+ punpcklbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ movq m1, [yq+strideq*2]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ movq m0, [yq+stride3q]
+ punpcklbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w8_hpad
+.w8_wpad:
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ movd m0, [yq+strideq]
+ punpcklbw m0, m0
+ punpcklqdq m0, m0
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w8_hpad:
+ mova [acq], m0
+ paddw m5, m0
+ mova [acq+16], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad
+ jmp .calc_avg_8_16
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m0, [yq+strideq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movd m1, [yq+strideq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhqdq m0, m1, m1
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:
+ movq m1, [yq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ pshufhw m1, m1, q3333
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movq m1, [yq+strideq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ mova m0, m1
+ pshufhw m0, m0, q3333
+ punpckhqdq m0, m0
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m0, [yq+strideq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w16_hpad_loop:
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m1
+ paddw m5, m0
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m1
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+.calc_avg_8_16:
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, m4
+ psrld m0, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m0, m4
+ paddd m5, m0
+ jmp .calc_avg
+
+.w32:
+ pxor m0, m0
+ mova [rsp ], m0
+ mova [rsp+16], m0
+ mova [rsp+32], m0
+ mova [rsp+48], m0
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m4, [yq+16]
+ mova m3, m4
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ punpckhbw m4, m4
+ pmaddubsw m4, m2
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_loop
+ test hpadd, hpadd
+ jz .calc_avg_32
+ jmp .w32_hpad_loop
+.w32_wpad:
+ cmp wpadd, 2
+ jl .w32_pad1
+ je .w32_pad2
+ cmp wpadd, 4
+ jl .w32_pad3
+ je .w32_pad4
+ cmp wpadd, 6
+ jl .w32_pad5
+ je .w32_pad6
+.w32_pad7:
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ mova m0, m1
+ punpckhqdq m0, m0
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad7
+ jmp .w32_wpad_done
+.w32_pad6:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ pshufhw m0, m1, q3333
+ punpckhqdq m0, m0
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad6
+ jmp .w32_wpad_done
+.w32_pad5:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova m5, [rsp]
+ paddw m5, m1
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ punpckhqdq m3, m3
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad5
+ jmp .w32_wpad_done
+.w32_pad4:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ pshufhw m3, m3, q3333
+ punpckhqdq m3, m3
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad4
+ jmp .w32_wpad_done
+.w32_pad3:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ movd m3, [yq+16]
+ punpcklbw m3, m3
+ punpcklqdq m3, m3
+ pshufhw m3, m3, q3333
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ punpckhqdq m4, m4
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad3
+ jmp .w32_wpad_done
+.w32_pad2:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, [yq+16]
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ pshufhw m4, m3, q3333
+ punpckhqdq m4, m4
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad2
+ jmp .w32_wpad_done
+.w32_pad1:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m4, [yq+16]
+ mova m3, m4
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ punpckhbw m4, m4
+ punpcklqdq m4, m4
+ pshufhw m4, m4, q3333
+ pmaddubsw m4, m2
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad1
+.w32_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_32
+.w32_hpad_loop:
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova [acq+32], m3
+ mova [acq+48], m4
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ add acq, 64
+ sub hpadd, 1
+ jg .w32_hpad_loop
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+
+.calc_avg_32:
+ mova m5, [rsp]
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, [rsp+16]
+ mova m3, m0
+ psrld m0, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m0, m3
+ paddd m5, m0
+ mova m0, [rsp+32]
+ mova m3, m0
+ psrld m0, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m0, m3
+ mova m1, [rsp+48]
+ mova m3, m1
+ psrld m1, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m1, m3
+ paddd m1, m0
+ paddd m5, m1
+.calc_avg:
+ movd szd, m6
+ psrad m6, 1
+ tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height);
+ paddd m5, m6
+ movd m1, r1d
+ pshufd m0, m5, q2301
+ paddd m0, m5
+ pshufd m5, m0, q1032
+ paddd m0, m5
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0
+ RELOAD_ACQ_32 acq ; ac = ac_orig
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+; %1 simd register that holds the mask and will hold the result
+; %2 simd register that holds the "true" values
+; %3 location of the "false" values (simd register/memory)
+%macro BLEND 3 ; mask, true, false
+ pand %2, %1
+ pandn %1, %3
+ por %1, %2
+%endmacro
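+; i.e. a branchless select: %1 = (%1 & %2) | (~%1 & %3), where %1 is an
+; all-ones/all-zeros byte mask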
+
+%macro PAETH 2 ; top, ldiff
+ pavgb m1, m%1, m3
+ pxor m0, m%1, m3
+ pand m0, m4
+ psubusb m2, m5, m1
+ psubb m1, m0
+ psubusb m1, m5
+ por m1, m2
+ paddusb m1, m1
+ por m1, m0 ; min(tldiff, 255)
+ psubusb m2, m5, m3
+ psubusb m0, m3, m5
+ por m2, m0 ; tdiff
+%ifnum %2
+ pminub m2, m%2
+ pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
+%else
+ mova m0, %2
+ pminub m2, m0
+ pcmpeqb m0, m2
+%endif
+ pminub m1, m2
+ pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff
+ mova m2, m3
+ BLEND m0, m2, m%1
+ BLEND m1, m0, m5
+%endmacro
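+; Rough scalar equivalent of the Paeth rule implemented above:
+;   base = left + top - topleft
+;   ldiff = abs(base - left), tdiff = abs(base - top),
+;   tldiff = abs(base - topleft) (saturated to 255 in the SIMD version)
+;   dst = (ldiff <= tdiff && ldiff <= tldiff) ? left
+;       : (tdiff <= tldiff) ? top : topleft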
+
+cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
+%define base r5-ipred_paeth_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ pxor m0, m0
+ movd m5, [tlq]
+ pshufb m5, m0
+ LEA r5, ipred_paeth_ssse3_table
+ movsxd wq, [r5+wq*4]
+ movddup m4, [base+ipred_paeth_shuf]
+ add wq, r5
+ jmp wq
+.w4:
+ movd m6, [tlq+1] ; top
+ pshufd m6, m6, q0000
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:
+ sub tlq, 4
+ movd m3, [tlq]
+ mova m1, [base+ipred_h_shuf]
+ pshufb m3, m1 ; left
+ PAETH 6, 7
+ movd [dstq ], m1
+ pshuflw m0, m1, q1032
+ movd [dstq+strideq ], m0
+ punpckhqdq m1, m1
+ movd [dstq+strideq*2], m1
+ psrlq m1, 32
+ movd [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ movddup m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w8_loop:
+ sub tlq, 2
+ movd m3, [tlq]
+ pshufb m3, [base+ipred_paeth_shuf]
+ PAETH 6, 7
+ movq [dstq ], m1
+ movhps [dstq+strideq], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w16_loop:
+ sub tlq, 1
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1
+ PAETH 6, 7
+ mova [dstq], m1
+ add dstq, strideq
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp ], m6
+ mova [rsp+16], m7
+ movu m6, [tlq+17]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+32], m6
+.w32_loop:
+ dec tlq
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1
+ mova m6, [rsp]
+ PAETH 6, [rsp+16]
+ mova [dstq ], m1
+ mova m6, [rsp+32]
+ PAETH 6, 7
+ mova [dstq+16], m1
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp ], m6
+ mova [rsp+16], m7
+ movu m6, [tlq+17]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+32], m6
+ mova [rsp+48], m7
+ movu m6, [tlq+33]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+64], m6
+ mova [rsp+80], m7
+ movu m6, [tlq+49]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+96], m6
+.w64_loop:
+ dec tlq
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1
+ mova m6, [rsp]
+ PAETH 6, [rsp+16]
+ mova [dstq ], m1
+ mova m6, [rsp+32]
+ PAETH 6, [rsp+48]
+ mova [dstq+16], m1
+ mova m6, [rsp+64]
+ PAETH 6, [rsp+80]
+ mova [dstq+32], m1
+ mova m6, [rsp+96]
+ PAETH 6, 7
+ mova [dstq+48], m1
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+
+%macro FILTER 4 ;dst, src, tmp, shuf
+%ifnum %4
+ pshufb m%2, m%4
+%else
+ pshufb m%2, %4
+%endif
+ pshufd m%1, m%2, q0000 ;p0 p1
+ pmaddubsw m%1, m2
+ pshufd m%3, m%2, q1111 ;p2 p3
+ pmaddubsw m%3, m3
+ paddw m%1, [base+pw_8]
+ paddw m%1, m%3
+ pshufd m%3, m%2, q2222 ;p4 p5
+ pmaddubsw m%3, m4
+ paddw m%1, m%3
+ pshufd m%3, m%2, q3333 ;p6 __
+ pmaddubsw m%3, m5
+ paddw m%1, m%3
+ psraw m%1, 4
+ packuswb m%1, m%1
+%endmacro
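+; Each FILTER invocation computes a 4x2 filter-intra block, roughly
+; (illustrative scalar form):
+;   dst[x] = clip_pixel((8 + sum over i of taps[x][i] * p[i]) >> 4)
+; with p[] the 7-pixel context (p0 = top-left, p1..p4 = top row,
+; p5..p6 = left column) gathered by the shuffle, the taps loaded from
+; filter_intra_taps, the +8 / >> 4 done via pw_8 and psraw, and the clip
+; by packuswb.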
+
+cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
+%define base r6-$$
+ LEA r6, $$
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ lea filterq, [base+filter_intra_taps+filterq]
+ movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4
+ movsxd wq, [base+ipred_filter_ssse3_table+wq*4]
+ mova m2, [filterq+16*0]
+ mova m3, [filterq+16*1]
+ mova m4, [filterq+16*2]
+ mova m5, [filterq+16*3]
+ lea wq, [base+ipred_filter_ssse3_table+wq]
+ mov hd, hm
+ jmp wq
+.w4:
+ mova m1, [base+filter_shuf1]
+ sub tlq, 3
+ sub tlq, hq
+ jmp .w4_loop_start
+.w4_loop:
+ movd m0, [tlq+hq]
+ punpckldq m0, m6
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER 6, 0, 7, 1
+ movd [dstq+strideq*0], m6
+ pshuflw m6, m6, q1032
+ movd [dstq+strideq*1], m6
+ sub hd, 2
+ jg .w4_loop
+ RET
+
+ALIGN function_align
+.w8:
+ movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4
+ sub tlq, 5
+ sub tlq, hq
+
+.w8_loop:
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER 0, 6, 1, [base+filter_shuf2]
+
+ punpckldq m6, m7, m0
+ movq [dstq+strideq*0], m6
+ punpckhqdq m6, m6
+ movq [dstq+strideq*1], m6
+
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+
+ALIGN function_align
+.w16:
+ movu m6, [tlq+1] ;top row
+ sub tlq, 5
+ sub tlq, hq
+
+.w16_loop:
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+4+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+8+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movd [dstq+12+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+strideq*1], m6
+
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1] ;top row
+ lea filterq, [tlq+17]
+ sub tlq, 5
+ sub tlq, hq
+
+.w32_loop:
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+4+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+8+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movu m1, [filterq]
+ punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
+ punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+12+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+strideq*1], m6
+
+ mova m6, m1
+
+ FILTER 7, 0, 6, [base+filter_shuf2]
+ punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+16+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m1, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+20+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+24+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movd [dstq+28+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+16+strideq*1], m6
+
+ mova m6, [dstq+strideq*1]
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+ lea filterq, [dstq+16+strideq*1]
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
diff --git a/third_party/dav1d/src/x86/itx.h b/third_party/dav1d/src/x86/itx.h
new file mode 100644
index 0000000000..478eb6c6b6
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx.h
@@ -0,0 +1,363 @@
+/*
+ * Copyright © 2018-2023, VideoLAN and dav1d authors
+ * Copyright © 2018-2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+#define decl_itx_fns(ext) \
+decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 4, 8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8, 4, ext); \
+decl_itx16_fns( 8, 8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx2_fns ( 8, 32, ext); \
+decl_itx16_fns(16, 4, ext); \
+decl_itx16_fns(16, 8, ext); \
+decl_itx12_fns(16, 16, ext); \
+decl_itx2_fns (16, 32, ext); \
+decl_itx2_fns (32, 8, ext); \
+decl_itx2_fns (32, 16, ext); \
+decl_itx2_fns (32, 32, ext); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext))
+
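+/* Illustrative expansion, assuming BF() appends the template bitdepth and
+ * ISA suffix analogously to BF_BPC() above: with BITDEPTH == 8,
+ * decl_itx2_fns(8, 32, avx2) declares
+ *   dav1d_inv_txfm_add_dct_dct_8x32_8bpc_avx2
+ *   dav1d_inv_txfm_add_identity_identity_8x32_8bpc_avx2
+ */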
+
+#define decl_itx2_bpc_fns(w, h, bpc, opt) \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt))
+
+#define decl_itx12_bpc_fns(w, h, bpc, opt) \
+decl_itx2_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt))
+
+#define decl_itx16_bpc_fns(w, h, bpc, opt) \
+decl_itx12_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt))
+
+#define decl_itx_bpc_fns(bpc, ext) \
+decl_itx16_bpc_fns( 4, 4, bpc, ext); \
+decl_itx16_bpc_fns( 4, 8, bpc, ext); \
+decl_itx16_bpc_fns( 4, 16, bpc, ext); \
+decl_itx16_bpc_fns( 8, 4, bpc, ext); \
+decl_itx16_bpc_fns( 8, 8, bpc, ext); \
+decl_itx16_bpc_fns( 8, 16, bpc, ext); \
+decl_itx2_bpc_fns ( 8, 32, bpc, ext); \
+decl_itx16_bpc_fns(16, 4, bpc, ext); \
+decl_itx16_bpc_fns(16, 8, bpc, ext); \
+decl_itx12_bpc_fns(16, 16, bpc, ext); \
+decl_itx2_bpc_fns (16, 32, bpc, ext); \
+decl_itx2_bpc_fns (32, 8, bpc, ext); \
+decl_itx2_bpc_fns (32, 16, bpc, ext); \
+decl_itx2_bpc_fns (32, 32, bpc, ext); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext))
+
+decl_itx_fns(avx512icl);
+decl_itx_bpc_fns(10, avx512icl);
+decl_itx_fns(avx2);
+decl_itx_bpc_fns(10, avx2);
+decl_itx_bpc_fns(12, avx2);
+decl_itx_fns(sse4);
+decl_itx_fns(ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
+
+static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+
+#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
+
+#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext)
+
+#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext)
+
+#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext)
+
+#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext)
+
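+ /* The pfx argument picks the transform-size enum used to index itxfm_add:
+  * empty for the square TX_WxH entries, R for the rectangular RTX_WxH
+  * entries in the assignments below. */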
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn(, 4, 4, ssse3);
+ assign_itx16_fn(R, 4, 8, ssse3);
+ assign_itx16_fn(R, 8, 4, ssse3);
+ assign_itx16_fn(, 8, 8, ssse3);
+ assign_itx16_fn(R, 4, 16, ssse3);
+ assign_itx16_fn(R, 16, 4, ssse3);
+ assign_itx16_fn(R, 8, 16, ssse3);
+ assign_itx16_fn(R, 16, 8, ssse3);
+ assign_itx12_fn(, 16, 16, ssse3);
+ assign_itx2_fn (R, 8, 32, ssse3);
+ assign_itx2_fn (R, 32, 8, ssse3);
+ assign_itx2_fn (R, 16, 32, ssse3);
+ assign_itx2_fn (R, 32, 16, ssse3);
+ assign_itx2_fn (, 32, 32, ssse3);
+ assign_itx1_fn (R, 16, 64, ssse3);
+ assign_itx1_fn (R, 32, 64, ssse3);
+ assign_itx1_fn (R, 64, 16, ssse3);
+ assign_itx1_fn (R, 64, 32, ssse3);
+ assign_itx1_fn ( , 64, 64, ssse3);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+#if BITDEPTH == 16
+ if (bpc == 10) {
+ assign_itx16_fn(, 4, 4, sse4);
+ assign_itx16_fn(R, 4, 8, sse4);
+ assign_itx16_fn(R, 4, 16, sse4);
+ assign_itx16_fn(R, 8, 4, sse4);
+ assign_itx16_fn(, 8, 8, sse4);
+ assign_itx16_fn(R, 8, 16, sse4);
+ assign_itx16_fn(R, 16, 4, sse4);
+ assign_itx16_fn(R, 16, 8, sse4);
+ assign_itx12_fn(, 16, 16, sse4);
+ assign_itx2_fn (R, 8, 32, sse4);
+ assign_itx2_fn (R, 32, 8, sse4);
+ assign_itx2_fn (R, 16, 32, sse4);
+ assign_itx2_fn (R, 32, 16, sse4);
+ assign_itx2_fn (, 32, 32, sse4);
+ assign_itx1_fn (R, 16, 64, sse4);
+ assign_itx1_fn (R, 32, 64, sse4);
+ assign_itx1_fn (R, 64, 16, sse4);
+ assign_itx1_fn (R, 64, 32, sse4);
+ assign_itx1_fn (, 64, 64, sse4);
+ }
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx2);
+ assign_itx16_fn(R, 4, 8, avx2);
+ assign_itx16_fn(R, 4, 16, avx2);
+ assign_itx16_fn(R, 8, 4, avx2);
+ assign_itx16_fn( , 8, 8, avx2);
+ assign_itx16_fn(R, 8, 16, avx2);
+ assign_itx2_fn (R, 8, 32, avx2);
+ assign_itx16_fn(R, 16, 4, avx2);
+ assign_itx16_fn(R, 16, 8, avx2);
+ assign_itx12_fn( , 16, 16, avx2);
+ assign_itx2_fn (R, 16, 32, avx2);
+ assign_itx1_fn (R, 16, 64, avx2);
+ assign_itx2_fn (R, 32, 8, avx2);
+ assign_itx2_fn (R, 32, 16, avx2);
+ assign_itx2_fn ( , 32, 32, avx2);
+ assign_itx1_fn (R, 32, 64, avx2);
+ assign_itx1_fn (R, 64, 16, avx2);
+ assign_itx1_fn (R, 64, 32, avx2);
+ assign_itx1_fn ( , 64, 64, avx2);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 4, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 10, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 16, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 16, 64, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 16, 10, avx2);
+ assign_itx2_bpc_fn ( , 32, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 32, 64, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 16, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 32, 10, avx2);
+ assign_itx1_bpc_fn ( , 64, 64, 10, avx2);
+ } else {
+ assign_itx16_bpc_fn( , 4, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 12, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
+ assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2);
+ assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2);
+ assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2);
+ }
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx512icl); // no wht
+ assign_itx16_fn(R, 4, 8, avx512icl);
+ assign_itx16_fn(R, 4, 16, avx512icl);
+ assign_itx16_fn(R, 8, 4, avx512icl);
+ assign_itx16_fn( , 8, 8, avx512icl);
+ assign_itx16_fn(R, 8, 16, avx512icl);
+ assign_itx2_fn (R, 8, 32, avx512icl);
+ assign_itx16_fn(R, 16, 4, avx512icl);
+ assign_itx16_fn(R, 16, 8, avx512icl);
+ assign_itx12_fn( , 16, 16, avx512icl);
+ assign_itx2_fn (R, 16, 32, avx512icl);
+ assign_itx1_fn (R, 16, 64, avx512icl);
+ assign_itx2_fn (R, 32, 8, avx512icl);
+ assign_itx2_fn (R, 32, 16, avx512icl);
+ assign_itx2_fn ( , 32, 32, avx512icl);
+ assign_itx1_fn (R, 32, 64, avx512icl);
+ assign_itx1_fn (R, 64, 16, avx512icl);
+ assign_itx1_fn (R, 64, 32, avx512icl);
+ assign_itx1_fn ( , 64, 64, avx512icl);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 8, 8, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 16, 32, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl);
+ assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
+ assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
+ }
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/x86/itx16_avx2.asm b/third_party/dav1d/src/x86/itx16_avx2.asm
new file mode 100644
index 0000000000..2315ec1e47
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_avx2.asm
@@ -0,0 +1,8599 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2021, Matthias Dressel
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
+ dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
+idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7
+idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5
+iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6
+iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
+pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048
+idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
+idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+
+%macro COEF_PAIR 2-3 0
+pd_%1_%2: dd %1, %1, %2, %2
+%define pd_%1 (pd_%1_%2 + 4*0)
+%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3
+dd -%2, -%2
+%define pd_%2_m%2 pd_%2
+%endif
+%endmacro
+
+COEF_PAIR 201, 995
+COEF_PAIR 401, 1931
+COEF_PAIR 799, 3406
+COEF_PAIR 1380, 601
+COEF_PAIR 1751, 2440
+COEF_PAIR 2598, 1189
+COEF_PAIR 2751, 2106
+COEF_PAIR 2896, 1567, 1
+COEF_PAIR 2896, 3784, 1
+COEF_PAIR 3035, 3513
+COEF_PAIR 3166, 3920
+COEF_PAIR 3703, 3290
+COEF_PAIR 3857, 4052
+COEF_PAIR 4017, 2276
+COEF_PAIR 4076, 3612
+COEF_PAIR 4091, 3973
+
+pd_8: dd 8
+pd_m601: dd -601
+pd_m1189: dd -1189
+pd_m1380: dd -1380
+pd_m2106: dd -2106
+pd_m2598: dd -2598
+pd_m2751: dd -2751
+pd_m3344: dd -3344
+pd_1024: dd 1024
+pd_1321: dd 1321
+pd_1448: dd 1448
+pd_1697: dd 1697
+pd_2482: dd 2482
+pd_3072: dd 3072 ; 1024 + 2048
+pd_3803: dd 3803
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
+pd_5793: dd 5793
+pd_6144: dd 6144 ; 2048 + 4096
+pd_17408: dd 17408 ; 1024 + 16384
+
+pixel_10bpc_max: times 2 dw 0x03ff
+pixel_12bpc_max: times 2 dw 0x0fff
+dconly_10bpc: times 2 dw 0x7c00
+dconly_12bpc: times 2 dw 0x7000
+clip_18b_min: dd -0x20000
+clip_18b_max: dd 0x1ffff
+clip_20b_min: dd -0x80000
+clip_20b_max: dd 0x7ffff
+
+idct64_mul_16bpc:
+dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
+dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
+dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
+dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
+
+cextern deint_shuf
+cextern idct64_mul
+cextern pw_1697x8
+cextern pw_1697x16
+cextern pw_1567_3784
+cextern pw_m1567_m3784
+cextern pw_m3784_1567
+cextern pw_2896_2896
+cextern pw_m2896_2896
+cextern pw_5
+cextern pw_2048
+cextern pw_4096
+cextern pw_8192
+cextern pw_16384
+cextern pw_2896x8
+cextern pd_2048
+
+cextern idct_4x8_internal_8bpc_avx2.main
+cextern idct_4x16_internal_8bpc_avx2.main
+cextern idct_8x8_internal_8bpc_avx2.main
+cextern idct_8x16_internal_8bpc_avx2.main
+cextern idct_16x4_internal_8bpc_avx2.main
+cextern idct_16x8_internal_8bpc_avx2.main
+cextern idct_16x16_internal_8bpc_avx2.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal
+
+cextern iadst_4x4_internal_8bpc_avx2.main
+cextern iadst_4x8_internal_8bpc_avx2.main_pass2
+cextern iadst_4x16_internal_8bpc_avx2.main2
+cextern iadst_8x4_internal_8bpc_avx2.main
+cextern iadst_8x8_internal_8bpc_avx2.main_pass2
+cextern iadst_8x16_internal_8bpc_avx2.main
+cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
+cextern iadst_16x4_internal_8bpc_avx2.main
+cextern iadst_16x8_internal_8bpc_avx2.main
+cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
+cextern iadst_16x16_internal_8bpc_avx2.main
+cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%macro WRAP_XMM 1+
+ INIT_XMM cpuname
+ %1
+ INIT_YMM cpuname
+%endmacro
+
+%macro IWHT4_1D_PACKED 0
+ ; m0 = in0 in2, m1 = in1 in3
+ psubd m2, m0, m1 ; t2
+ paddd xm0, xm1 ; t0
+ vpermq m2, m2, q3322
+ vpermq m0, m0, q1100
+ vpermq m1, m1, q3120
+ psubd m3, m0, m2
+ psrad m3, 1
+ psubd m3, m1 ; t1 t3
+ psubd m0, m3 ; ____ out0
+ paddd m2, m3 ; out3 ____
+%endmacro
+
+INIT_YMM avx2
+cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
+ mova xm0, [cq+16*0]
+ vinserti128 m0, [cq+16*2], 1
+ mova xm1, [cq+16*1]
+ vinserti128 m1, [cq+16*3], 1
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ lea r6, [dstq+strideq*2]
+ psrad m0, 2
+ psrad m1, 2
+ IWHT4_1D_PACKED
+ punpckhdq m0, m3
+ punpckldq m3, m2
+ punpckhqdq m1, m0, m3
+ punpcklqdq m0, m3
+ IWHT4_1D_PACKED
+ vpblendd m0, m2, 0x33
+ packssdw m0, m3
+ vextracti128 xm2, m0, 1
+ punpckhdq xm1, xm0, xm2 ; out2 out1
+ punpckldq xm0, xm2 ; out3 out0
+ movq xm2, [r6 +strideq*1]
+ movhps xm2, [dstq+strideq*0]
+ movq xm3, [r6 +strideq*0]
+ movhps xm3, [dstq+strideq*1]
+%ifidn bdmaxd, bdmaxm
+ movd xm5, bdmaxd
+ vpbroadcastw xm5, xm5
+%else ; win64: load from stack
+ vpbroadcastw xm5, bdmaxm
+%endif
+ paddsw xm0, xm2
+ paddsw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movq [r6 +strideq*0], xm1
+ movq [r6 +strideq*1], xm0
+ RET
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 1 = packed, 2 = inv_dst2
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+%if %9 & 1
+ vbroadcasti128 m%3, [pd_%8]
+%else
+ vpbroadcastd m%3, [pd_%8]
+%endif
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+%if %9 & 1
+ vbroadcasti128 m%5, [pd_%7]
+%else
+ vpbroadcastd m%5, [pd_%7]
+%endif
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 2
+ psubd m%4, m%6, m%4
+ psubd m%2, m%4, m%2
+%else
+%ifnum %6
+ paddd m%4, m%6
+%endif
+ paddd m%2, m%4
+%endif
+%ifnum %6
+ paddd m%1, m%6
+%endif
+ psubd m%1, m%3
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
+
+%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth
+cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_%5bpc)
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%4_internal_%5bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+%if %3
+ add eobd, %3
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x4, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
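+ ; DC-only shortcut (rough sketch): the lone DC coefficient is scaled by
+ ; 2896/4096 (~1/sqrt(2)) per 1-D pass, with the second step also folding
+ ; in the final >> 4 downshift:
+ ;   dc = (dc * 181 + 128) >> 8; dc = (dc * 181 + 128 + 2048) >> 12;
+ ; it is then added to every pixel, and the paddsw/psubusw pair against
+ ; dconly_10bpc clips the result to the 10-bit pixel range.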
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
+.dconly2:
+ add r6d, 128
+ sar r6d, 8
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm2
+ vpbroadcastw xm0, xm0
+.dconly_loop:
+ movq xm1, [dstq+strideq*0]
+ movhps xm1, [dstq+strideq*1]
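+ ; xm0 already includes the clip bias from xm2; the saturating add followed
+ ; by the unsigned saturating subtract clamps the sum to the valid pixel range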
+ paddsw xm1, xm0
+ psubusw xm1, xm2
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ WRAP_XMM RET
+%else
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd
+ ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1
+ punpckhqdq m%3, m%2, m%1 ; t3 t2
+ punpcklqdq m%2, m%1 ; t0 t1
+ paddd m%1, m%2, m%3 ; out0 out1
+ psubd m%2, m%3 ; out3 out2
+%endmacro
+
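+; 16-bit variant of the packed 4-point inverse DCT: the rotations are done
+; with pmaddwd on word-pair constants, rounded with m%6 and shifted by 12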
+%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
+ vpbroadcastd m%5, [pw_m3784_1567]
+ punpckhwd m%3, m%2, m%1
+ vpbroadcastd m%4, [pw_1567_3784]
+ punpcklwd m%2, m%1
+ vpbroadcastd m%1, [pw_m2896_2896]
+ pmaddwd m%5, m%3
+ pmaddwd m%3, m%4
+ vpbroadcastd m%4, [pw_2896_2896]
+ pmaddwd m%1, m%2
+ pmaddwd m%2, m%4
+ REPX {paddd x, m%6}, m%5, m%3, m%1, m%2
+ REPX {psrad x, 12 }, m%5, m%3, m%1, m%2
+ packssdw m%3, m%5 ; t3 t2
+ packssdw m%2, m%1 ; t0 t1
+ paddsw m%1, m%2, m%3 ; out0 out1
+ psubsw m%2, m%3 ; out3 out2
+%endmacro
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, identity
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+
+cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ call .main
+ vbroadcasti128 m2, [idct4_shuf]
+ packssdw m0, m1
+ pshufb m0, m2
+ jmp tx2q
+.pass2:
+ vextracti128 xm1, m0, 1
+ WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5
+ packssdw xm5, xm5 ; pw_2048
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movq xm3, [r6 +strideq*1]
+ movhps xm3, [r6 +strideq*0]
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movhps [r6 +strideq*0], xm1
+ movq [r6 +strideq*1], xm1
+ RET
+ALIGN function_align
+.main:
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m5, [pd_2048]
+.main2:
+ IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5
+ ret
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
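+; 4-point inverse ADST using the 1321/2482/3344/3803 constant set; outputs
+; are left unrounded, callers add the rounding constant and shift afterwards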
+%macro IADST4_1D 0
+ vpbroadcastd m5, [pd_1321]
+ vpbroadcastd m7, [pd_2482]
+ pmulld m4, m0, m5 ; 1321*in0
+ pmulld m6, m3, m7 ; 2482*in3
+ paddd m4, m6 ; 1321*in0 + 2482*in3
+ pmulld m6, m0, m7 ; 2482*in0
+ paddd m0, m3 ; in0 + in3
+ paddd m7, m5 ; pd_3803
+ pmulld m5, m2 ; 1321*in2
+ pmulld m3, m7 ; 3803*in3
+ pmulld m7, m2 ; 3803*in2
+ psubd m2, m0 ; in2 - in0 - in3
+ vpbroadcastd m0, [pd_m3344]
+ pmulld m1, m0 ; -t3
+ pmulld m2, m0 ; out2 (unrounded)
+ psubd m6, m5 ; 2482*in0 - 1321*in2
+ paddd m4, m7 ; t0
+ psubd m6, m3 ; t1
+ paddd m3, m4, m6
+ psubd m4, m1 ; out0 (unrounded)
+ psubd m6, m1 ; out1 (unrounded)
+ paddd m3, m1 ; out3 (unrounded)
+%endmacro
+
+cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ call .main
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_2048]
+ mova m2, [itx4_shuf]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+ packssdw m0, m1
+ vpermd m0, m2, m0
+ psrld m2, 4
+ pshufb m0, m2
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ jmp tx2q
+.pass2:
+ lea r6, [deint_shuf+128]
+ vextracti128 xm1, m0, 1
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ vpbroadcastd xm4, [pw_2048]
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movq xm3, [r6 +strideq*0]
+ movhps xm3, [r6 +strideq*1]
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pmulhrsw xm0, xm4
+ pmulhrsw xm1, xm4
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [r6 +strideq*0], xm1
+ movhps [r6 +strideq*1], xm1
+ RET
+ALIGN function_align
+.main:
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+%if WIN64
+ movaps [rsp+16], xmm6
+ movaps [rsp+32], xmm7
+%endif
+.main2:
+ WRAP_XMM IADST4_1D
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_10bpc).main
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_10bpc).pass1_end
+.pass2:
+ lea r6, [deint_shuf+128]
+ vextracti128 xm1, m0, 1
+ call m(iadst_4x4_internal_8bpc).main
+ vpbroadcastd xm4, [pw_2048]
+ movq xm3, [dstq+strideq*1]
+ movhps xm3, [dstq+strideq*0]
+ lea r6, [dstq+strideq*2]
+ movq xm2, [r6 +strideq*1]
+ movhps xm2, [r6 +strideq*0]
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pmulhrsw xm0, xm4
+ pmulhrsw xm1, xm4
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movhps [dstq+strideq*0], xm1
+ movq [dstq+strideq*1], xm1
+ movhps [r6 +strideq*0], xm0
+ movq [r6 +strideq*1], xm0
+ RET
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ vpbroadcastd m1, [pd_5793]
+ pmulld m0, m1, [cq+32*0]
+ pmulld m1, [cq+32*1]
+ vpbroadcastd m5, [pd_2048]
+ mova m3, [itx4_shuf]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+ packssdw m0, m1
+ vpermd m0, m3, m0
+ psrld m3, 4
+ pshufb m0, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m1, [pw_1697x8]
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ pmulhrsw m1, m0
+ paddsw m0, m1
+ movq xm3, [r6 +strideq*0]
+ movhps xm3, [r6 +strideq*1]
+ vpbroadcastd xm4, [pixel_10bpc_max]
+ packssdw m5, m5 ; pw_2048
+ pmulhrsw m0, m5
+ pxor m5, m5
+ mova [cq+32*0], m5
+ mova [cq+32*1], m5
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm5
+ pmaxsw xm1, xm5
+ pminsw xm0, xm4
+ pminsw xm1, xm4
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [r6 +strideq*0], xm1
+ movhps [r6 +strideq*1], xm1
+ RET
+
+INV_TXFM_4X4_FN dct, dct, 12
+INV_TXFM_4X4_FN dct, identity, 12
+INV_TXFM_4X4_FN dct, adst, 12
+INV_TXFM_4X4_FN dct, flipadst, 12
+
+cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(idct_4x4_internal_10bpc).main
+ mova m3, [idct4_12_shuf]
+ mova m4, [idct4_12_shuf2]
+ vpermd m2, m4, m1
+ vpermd m1, m3, m0
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
+.pass2:
+ vpbroadcastd m5, [pd_2048]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ call m(idct_4x4_internal_10bpc).main2
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_4x4_internal_12bpc).end
+
+INV_TXFM_4X4_FN adst, dct, 12
+INV_TXFM_4X4_FN adst, adst, 12
+INV_TXFM_4X4_FN adst, flipadst, 12
+INV_TXFM_4X4_FN adst, identity, 12
+
+cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_10bpc).main
+ vinserti128 m1, m4, xm6, 1
+ vinserti128 m2, xm3, 1
+.pass1_end:
+ mova m3, [itx4_shuf]
+ vpbroadcastd m5, [pd_1024]
+ psrad m1, 1
+ psrad m2, 1
+ vpermd m1, m3, m1
+ vpermd m2, m3, m2
+ paddd m1, m5
+ paddd m2, m5
+ psrad m1, 11
+ psrad m2, 11
+.pass1_end2:
+ vpbroadcastd m3, [clip_18b_min]
+ vpbroadcastd m4, [clip_18b_max]
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pmaxsd m0, m3
+ pmaxsd m1, m3
+ pminsd m0, m4
+ pminsd m1, m4
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass2_end:
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+.end:
+%if WIN64
+ WIN64_RESTORE_XMM_INTERNAL
+ %assign xmm_regs_used 6
+%endif
+.end2:
+ vpbroadcastd m4, [pw_16384]
+ movq xm2, [dstq+strideq*0]
+ movq xm3, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movhps xm2, [r6 +strideq*0] ; dst0 dst2
+ movhps xm3, [r6 +strideq*1] ; dst1 dst3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ vinserti128 m2, xm3, 1
+ psrad m0, 3
+ psrad m1, 3
+ packssdw m0, m1 ; t0 t2 t1 t3
+ pmulhrsw m0, m4
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw m0, m2 ; out0 out2 out1 out3
+ pmaxsw m0, m4
+ pminsw m0, m5
+ vextracti128 xm1, m0, 1 ; out1 out3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [r6 +strideq*0], xm0
+ movhps [r6 +strideq*1], xm1
+ RET
+.main_pass2:
+ vextracti128 xm3, m1, 1
+ mova xm2, xm1
+ vextracti128 xm1, m0, 1
+ jmp m(iadst_4x4_internal_10bpc).main2
+
+INV_TXFM_4X4_FN flipadst, dct, 12
+INV_TXFM_4X4_FN flipadst, adst, 12
+INV_TXFM_4X4_FN flipadst, flipadst, 12
+INV_TXFM_4X4_FN flipadst, identity, 12
+
+cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_10bpc).main
+ vinserti128 m1, m3, xm2, 1
+ vinserti128 m2, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass1_end
+.pass2:
+ call m(iadst_4x4_internal_12bpc).main_pass2
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass2_end
+
+INV_TXFM_4X4_FN identity, dct, 12
+INV_TXFM_4X4_FN identity, adst, 12
+INV_TXFM_4X4_FN identity, flipadst, 12
+INV_TXFM_4X4_FN identity, identity, 12
+
+cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ mova m2, [itx4_shuf]
+ vpbroadcastd m3, [pd_1697]
+ vpermd m0, m2, [cq+32*0]
+ vpermd m2, m2, [cq+32*1]
+ vpbroadcastd m5, [pd_2048]
+ pmulld m1, m3, m0
+ pmulld m3, m2
+ paddd m1, m5
+ paddd m3, m5
+ psrad m1, 12
+ psrad m3, 12
+ paddd m1, m0
+ paddd m2, m3
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ vpbroadcastd m3, [pd_5793]
+ vpbroadcastd m5, [pd_2048]
+ pmulld m0, m3
+ pmulld m1, m3
+ paddd m0, m5 ; 2048
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+ jmp m(iadst_4x4_internal_12bpc).end
+
+%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x8, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
+%else
+ jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
+%endif
+%endif
+%endmacro
+
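+; non-packed 4-point inverse DCT: in0/in2 are scaled by 2896 and rounded
+; internally, in1/in3 go through the 1567/3784 rotation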
+%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
+ ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3
+ vpbroadcastd m%5, [pd_2896]
+ pmulld m%1, m%5
+ pmulld m%3, m%5
+ paddd m%1, m%8
+ paddd m%5, m%1, m%3
+ psubd m%1, m%3
+ psrad m%5, 12 ; t0
+ psrad m%1, 12 ; t1
+ psubd m%3, m%1, m%2
+ paddd m%2, m%1
+ paddd m%1, m%5, m%4
+ psubd m%4, m%5, m%4
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m3, [pd_2896]
+ pmulld m0, m3, [cq+32*0]
+ pmulld m1, m3, [cq+32*1]
+ pmulld m2, m3, [cq+32*2]
+ pmulld m3, m3, [cq+32*3]
+ vpbroadcastd m7, [pd_2048]
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
+ jmp tx2q
+.pass2:
+ packssdw m0, m2
+ packssdw m1, m3
+ lea r6, [deint_shuf+128]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m2 ; 2 3
+ punpckldq m0, m2 ; 0 1
+ vextracti128 xm2, m0, 1 ; 4 5
+ vextracti128 xm3, m1, 1 ; 6 7
+ call m(idct_4x8_internal_8bpc).main
+ vpbroadcastd xm4, [pw_2048]
+ REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ movq xm5, [dstq+r3 ]
+ movhps xm5, [dstq+strideq*2]
+ movq xm6, [r6 +strideq*0]
+ movhps xm6, [r6 +strideq*1]
+ movq xm7, [r6 +r3 ]
+ movhps xm7, [r6 +strideq*2]
+ paddw xm0, xm4 ; 0 1
+ paddw xm1, xm5 ; 3 2
+ paddw xm2, xm6 ; 4 5
+ paddw xm3, xm7 ; 7 6
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
+ REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movhps [r6 +strideq*2], xm3
+ movq [r6 +r3 ], xm3
+ RET
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass1_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ mova xm4, [pw_2048_m2048]
+ REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
+.end:
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ movq xm5, [dstq+strideq*2]
+ movhps xm5, [dstq+r3 ]
+ movq xm6, [r6 +strideq*0]
+ movhps xm6, [r6 +strideq*1]
+ movq xm7, [r6 +strideq*2]
+ movhps xm7, [r6 +r3 ]
+ paddw xm0, xm4 ; 0 1
+ paddw xm1, xm5 ; 2 3
+ paddw xm2, xm6 ; 4 5
+ paddw xm3, xm7 ; 6 7
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
+ REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movq [r6 +strideq*2], xm3
+ movhps [r6 +r3 ], xm3
+ RET
+ALIGN function_align
+.pass2_main:
+ packssdw m0, m2
+ packssdw m1, m3
+ lea r6, [deint_shuf+128]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpckhdq m5, m4, m0
+ punpckldq m4, m0
+ vextracti128 xm2, m4, 1 ; 4 5
+ vextracti128 xm3, m5, 1 ; 6 7
+ pshufd xm4, xm4, q1032 ; 1 0
+ pshufd xm5, xm5, q1032 ; 3 2
+ jmp m(iadst_4x8_internal_8bpc).main_pass2
+ALIGN function_align
+.main:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+.main2:
+ vbroadcasti128 m0, [cq+16*0]
+ vbroadcasti128 m2, [cq+16*2]
+ vbroadcasti128 m3, [cq+16*5]
+ vbroadcasti128 m1, [cq+16*7]
+ vpbroadcastd m6, [pd_2896]
+ shufpd m0, m2, 0x0c ; 0 2
+ shufpd m1, m3, 0x0c ; 7 5
+ vbroadcasti128 m2, [cq+16*4]
+ vbroadcasti128 m4, [cq+16*6]
+ vbroadcasti128 m5, [cq+16*1]
+ vbroadcasti128 m3, [cq+16*3]
+ vpbroadcastd m7, [pd_2048]
+ shufpd m2, m4, 0x0c ; 4 6
+ shufpd m3, m5, 0x0c ; 3 1
+ REPX {pmulld x, m6}, m0, m1, m2, m3
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+.main3:
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1
+ ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1
+ psubd m4, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ REPX {pmaxsd x, m8}, m4, m2, m0, m1
+ REPX {pminsd x, m9}, m4, m2, m0, m1
+ pxor m5, m5
+ psubd m5, m4
+ vpblendd m4, m2, 0xcc ; t4 t7
+ vpblendd m2, m5, 0xcc ; t5 -t6
+ ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784
+ vpbroadcastd m5, [pd_2896]
+ vbroadcasti128 m6, [pw_2048_m2048] ; + + - -
+ punpckhqdq m3, m0, m1
+ punpcklqdq m0, m1
+ psubd m1, m0, m3 ; t2 t3
+ paddd m0, m3 ; out0 -out7
+ punpckhqdq m3, m4, m2 ; t7a t6a
+ punpcklqdq m4, m2 ; t5a t4a
+ psubd m2, m4, m3 ; t7 t6
+ paddd m4, m3 ; out6 -out1
+ REPX {pmaxsd x, m8}, m1, m2
+ REPX {pminsd x, m9}, m1, m2
+ vpblendd m3, m1, m2, 0xcc
+ shufpd m1, m2, 0x05
+ pmulld m3, m5
+ pmulld m5, m1
+ psignd m0, m6 ; out0 out7
+ psignd m4, m6 ; out6 out1
+ paddd m3, m7
+ psubd m2, m3, m5
+ paddd m5, m3
+ psrad m2, 12 ; out4 -out5
+ psrad m5, 12 ; -out3 out2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m2, m5, m6
+ paddd m3, m5, m4
+ jmp m(iadst_4x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_4x8_internal_10bpc).pass2_main
+ mova xm4, [pw_2048_m2048]
+ REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*1]
+ movhps xm4, [dstq+strideq*0]
+ movq xm5, [dstq+r3 ]
+ movhps xm5, [dstq+strideq*2]
+ movq xm6, [r6 +strideq*1]
+ movhps xm6, [r6 +strideq*0]
+ movq xm7, [r6 +r3 ]
+ movhps xm7, [r6 +strideq*2]
+ paddw xm3, xm4 ; 1 0
+ paddw xm2, xm5 ; 3 2
+ paddw xm1, xm6 ; 5 4
+ paddw xm0, xm7 ; 7 6
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0
+ REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0
+ movhps [dstq+strideq*0], xm3
+ movq [dstq+strideq*1], xm3
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r3 ], xm2
+ movhps [r6 +strideq*0], xm1
+ movq [r6 +strideq*1], xm1
+ movhps [r6 +strideq*2], xm0
+ movq [r6 +r3 ], xm0
+ RET
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m3, [pd_2896]
+ pmulld m0, m3, [cq+32*0]
+ pmulld m1, m3, [cq+32*1]
+ pmulld m2, m3, [cq+32*2]
+ pmulld m3, [cq+32*3]
+ vpbroadcastd m5, [pd_2048]
+ vpbroadcastd m4, [pd_5793]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {pmulld x, m4}, m0, m1, m2, m3
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m6, [pixel_10bpc_max]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
+ vpbroadcastd m4, [pw_4096]
+ packssdw m0, m2
+ packssdw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m4
+ pmulhrsw m0, m4
+ punpckhdq m1, m0, m2 ; 2 3 6 7
+ punpckldq m0, m2 ; 0 1 4 5
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ vpbroadcastq m4, [r6 +strideq*0]
+ vpbroadcastq m5, [r6 +strideq*1]
+ movq xm3, [dstq+strideq*2]
+ movhps xm3, [dstq+r3 ]
+ vpblendd m2, m4, 0x30
+ vpblendd m2, m5, 0xc0
+ vpbroadcastq m4, [r6 +strideq*2]
+ vpbroadcastq m5, [r6 +r3 ]
+ vpblendd m3, m4, 0x30
+ vpblendd m3, m5, 0xc0
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ paddw m0, m2 ; out0 out1 out4 out5
+ paddw m1, m3 ; out2 out3 out6 out7
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m6
+ pminsw m1, m6
+ vextracti128 xm2, m0, 1 ; out4 out5
+ vextracti128 xm3, m1, 1 ; out6 out7
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movq [r6 +strideq*2], xm3
+ movhps [r6 +r3 ], xm3
+ ret
+
+INV_TXFM_4X8_FN dct, dct, 12
+INV_TXFM_4X8_FN dct, identity, 12
+INV_TXFM_4X8_FN dct, adst, 12
+INV_TXFM_4X8_FN dct, flipadst, 12
+
+cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ jmp m(idct_4x8_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ ; transpose & interleave
+ pshufd m0, m0, q1320
+ pshufd m1, m1, q1320
+ pshufd m2, m2, q1320
+ pshufd m3, m3, q1320
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ vpermq m0, m0, q3102
+ vpermq m2, m2, q3102
+ vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved)
+ vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved)
+ vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
+ vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
+ vpbroadcastd m7, [pd_2048]
+ call m(idct_8x4_internal_10bpc).main
+ psubd m3, m0, m4 ; out7 out6
+ paddd m0, m4 ; out0 out1
+ paddd m1, m2, m5 ; out3 out2
+ psubd m2, m5 ; out4 out5
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ jmp m(iadst_4x8_internal_12bpc).end
+
+INV_TXFM_4X8_FN adst, dct, 12
+INV_TXFM_4X8_FN adst, adst, 12
+INV_TXFM_4X8_FN adst, flipadst, 12
+INV_TXFM_4X8_FN adst, identity, 12
+
+cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m4, 1
+ psrad m1, m6, 1
+ psrad m2, 1
+ psrad m3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_1024]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 11}, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call .pass2_main
+ vpblendd m3, m0, m4, 0x33 ; out6 out7
+ vpblendd m0, m4, 0xcc ; out0 out1
+ pshufd m1, m5, q1032
+ psignd m2, m6 ; out4 out5
+ psignd m1, m6 ; out2 out3
+.end:
+ vpbroadcastd m4, [pw_16384]
+ REPX {psrad x, 3}, m0, m1, m2, m3
+ packssdw m0, m2 ; 0 1 4 5 (interleaved)
+ packssdw m1, m3 ; 2 3 6 7 (interleaved)
+ mova m2, [iadst8_12_shuf]
+ vpermd m0, m2, m0 ; 0 1 4 5
+ vpermd m1, m2, m1 ; 2 3 6 7
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ movq xm5, [dstq+strideq*2]
+ movhps xm5, [dstq+r3 ]
+ movq xm6, [r6 +strideq*0]
+ movhps xm6, [r6 +strideq*1]
+ vinserti128 m4, xm6, 1
+ movq xm7, [r6 +strideq*2]
+ movhps xm7, [r6 +r3 ]
+ vinserti128 m5, xm7, 1
+ paddw m0, m4 ; 0 1 4 5
+ paddw m1, m5 ; 2 3 6 7
+ vpbroadcastd m5, [pixel_12bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, m4}, m0, m1
+ REPX {pminsw x, m5}, m0, m1
+ vextracti128 xm2, m0, 1 ; out4 out5
+ vextracti128 xm3, m1, 1 ; out6 out7
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movq [r6 +strideq*2], xm3
+ movhps [r6 +r3 ], xm3
+ RET
+ALIGN function_align
+.pass2_main:
+ ; transpose & interleave
+ pshufd m0, m0, q1320
+ pshufd m1, m1, q1320
+ pshufd m2, m2, q1320
+ pshufd m3, m3, q1320
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved)
+ vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved)
+ vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
+ vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
+ vpbroadcastd m7, [pd_2048]
+ jmp m(iadst_4x8_internal_10bpc).main3
+
+INV_TXFM_4X8_FN flipadst, dct, 12
+INV_TXFM_4X8_FN flipadst, adst, 12
+INV_TXFM_4X8_FN flipadst, flipadst, 12
+INV_TXFM_4X8_FN flipadst, identity, 12
+
+cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m3, 1
+ psrad m1, m2, 1
+ psrad m2, m6, 1
+ psrad m3, m4, 1
+ jmp m(iadst_4x8_internal_12bpc).pass1_end
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call m(iadst_4x8_internal_12bpc).pass2_main
+ shufpd m3, m4, m0, 0x05 ; out1 out0
+ shufpd m0, m4, 0x05 ; out7 out6
+ psignd m2, m6
+ pshufd m6, m6, q1032
+ pshufd m1, m2, q1032 ; out5 out4
+ psignd m2, m5, m6 ; out3 out2
+ jmp m(iadst_4x8_internal_12bpc).end
+
+INV_TXFM_4X8_FN identity, dct, 12
+INV_TXFM_4X8_FN identity, adst, 12
+INV_TXFM_4X8_FN identity, flipadst, 12
+INV_TXFM_4X8_FN identity, identity, 12
+
+cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ jmp m(iidentity_4x8_internal_10bpc).pass1
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m2 = in4 in5
+ ; m3 = in6 in7
+ vpbroadcastd m6, [pixel_12bpc_max]
+ call m(iidentity_4x8_internal_10bpc).pass2_end
+ RET
+
+%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x16, %3
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd xm2, [dconly_%3bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m10, [pd_3072]
+ mova m1, [cq+32*2]
+ mova m3, [cq+32*6]
+ mova m5, [cq+32*3]
+ mova m7, [cq+32*7]
+ call .pass1_main
+ pmulld m0, m6, [cq+32*0]
+ pmulld m2, m6, [cq+32*4]
+ pmulld m4, m6, [cq+32*1]
+ pmulld m6, [cq+32*5]
+ call .pass1_main2
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ lea r6, [deint_shuf+128]
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m4 ; 2 3
+ punpckldq m0, m4 ; 0 1
+ punpckldq m4, m5, m2 ; 8 9
+ punpckhdq m5, m2 ; a b
+ vextracti128 xm2, m0, 1 ; 4 5
+ vextracti128 xm3, m1, 1 ; 6 7
+ vextracti128 xm6, m4, 1 ; c d
+ vextracti128 xm7, m5, 1 ; e f
+ call m(idct_4x16_internal_8bpc).main
+ vpbroadcastd m9, [pw_2048]
+ vinserti128 m0, m0, xm1, 1 ; 0 1 3 2
+ vinserti128 m1, m2, xm3, 1 ; 4 5 7 6
+ vinserti128 m2, m4, xm5, 1 ; 8 9 b a
+ vinserti128 m3, m6, xm7, 1 ; c d f e
+ vpbroadcastd m8, [pixel_10bpc_max]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass1_main:
+ vpbroadcastd m4, [pd_3784]
+ vpbroadcastd m8, [pd_1567]
+ vpbroadcastd m9, [pd_2048]
+ vpbroadcastd m6, [pd_1448]
+ ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
+ ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
+ ret
+ALIGN function_align
+.pass1_main2:
+ paddd m0, m10
+ paddd m4, m10
+ paddd m8, m0, m2
+ psubd m0, m2
+ paddd m9, m4, m6
+ psubd m4, m6
+ REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
+ psubd m2, m0, m1
+ paddd m1, m0
+ psubd m6, m4, m5
+ paddd m5, m4
+ paddd m0, m8, m3
+ psubd m3, m8, m3
+ paddd m4, m9, m7
+ psubd m7, m9, m7
+ ret
+ALIGN function_align
+.pass2_end:
+ lea r6, [strideq*3]
+ pxor m7, m7
+ pmulhrsw m0, m9
+ call .write_4x4
+ pmulhrsw m0, m1, m9
+ call .write_4x4
+ pmulhrsw m0, m2, m9
+ call .write_4x4
+ pmulhrsw m0, m3, m9
+ call .write_4x4
+ ret
+ALIGN function_align
+.write_4x4:
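+ ; expects m7 = 0, m8 = pixel max, r6 = strideq*3; adds one packed 4x4 block
+ ; of residuals (m0) to dst, clips, and zeroes the consumed coefficients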
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ vpbroadcastq m5, [dstq+strideq*2]
+ vpbroadcastq m6, [dstq+r6 ]
+ mova [cq+32*0], m7
+ mova [cq+32*1], m7
+ add cq, 32*2
+ vpblendd m4, m5, 0xc0
+ vpblendd m4, m6, 0x30
+ paddw m4, m0
+ pmaxsw m4, m7
+ pminsw m4, m8
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+strideq*2], xm5
+ movq [dstq+r6 ], xm5
+ lea dstq, [dstq+strideq*4]
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_6144]
+ call m(iadst_16x4_internal_10bpc).main_end
+ psrad m0, m4, 13
+ psrad m1, m5, 13
+ psrad m2, 13
+ psrad m3, 13
+ psrad m4, m8, 13
+ psrad m5, m9, 13
+ psrad m6, 13
+ psrad m7, 13
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ vpbroadcastd m5, [pw_2048]
+ vpbroadcastd m8, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1
+ pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
+ vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13
+ pxor m7, m7
+ psubw m9, m7, m5
+ vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
+ pmulhrsw m0, m4, m9
+ call .write_4x4
+ pmulhrsw m0, m1, m9
+ call .write_4x4
+ pmulhrsw m0, m2, m9
+ call .write_4x4
+ pmulhrsw m0, m3, m9
+ call .write_4x4
+ RET
+ALIGN function_align
+.write_4x4:
+ movq xm4, [dstq+r6 ]
+ movhps xm4, [dstq+strideq*0]
+ vpbroadcastq m5, [dstq+strideq*1]
+ vpbroadcastq m6, [dstq+strideq*2]
+ mova [cq+32*0], m7
+ mova [cq+32*1], m7
+ add cq, 32*2
+ vpblendd m4, m5, 0xc0
+ vpblendd m4, m6, 0x30
+ paddw m4, m0
+ pmaxsw m4, m7
+ pminsw m4, m8
+ vextracti128 xm5, m4, 1
+ movhps [dstq+strideq*0], xm4
+ movhps [dstq+strideq*1], xm5
+ movq [dstq+strideq*2], xm5
+ movq [dstq+r6 ], xm4
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.pass2_main:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ lea r6, [deint_shuf+128]
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m4
+ punpckldq m0, m4
+ punpckldq m4, m5, m2
+ punpckhdq m5, m2
+ vpblendd m3, m0, m1, 0x33
+ vpblendd m0, m1, 0xcc
+ shufpd m2, m5, m4, 0x05
+ shufpd m4, m5, 0x05
+ vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5
+ vinserti128 m0, xm3, 1 ; 0 3 2 1
+ vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ????
+ vinserti128 m2, xm4, 1 ; b 8 9 a
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m5, [pw_2896x8]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
+ ret
+ALIGN function_align
+.main:
+ vbroadcasti128 m0, [cq+16* 0]
+ vbroadcasti128 m4, [cq+16* 2]
+ vbroadcasti128 m1, [cq+16*15]
+ vbroadcasti128 m5, [cq+16*13]
+ vbroadcasti128 m2, [cq+16* 4]
+ vbroadcasti128 m6, [cq+16* 6]
+ vbroadcasti128 m3, [cq+16*11]
+ vbroadcasti128 m7, [cq+16* 9]
+ shufpd m0, m4, 0x0c ; 0 2
+ shufpd m1, m5, 0x0c ; 15 13
+ shufpd m2, m6, 0x0c ; 4 6
+ shufpd m3, m7, 0x0c ; 11 9
+ vbroadcasti128 m4, [cq+16* 8]
+ vbroadcasti128 m6, [cq+16*10]
+ vbroadcasti128 m5, [cq+16* 7]
+ vbroadcasti128 m7, [cq+16* 5]
+ shufpd m4, m6, 0x0c ; 8 10
+ shufpd m5, m7, 0x0c ; 7 5
+ vbroadcasti128 m6, [cq+16*12]
+ vbroadcasti128 m7, [cq+16*14]
+ shufpd m6, m7, 0x0c ; 12 14
+ vbroadcasti128 m7, [cq+16* 3]
+ vbroadcasti128 m8, [cq+16* 1]
+ shufpd m7, m8, 0x0c ; 3 1
+.main2:
+ ; expects: m12 = clip_min m13 = clip_max
+ vpbroadcastd m11, [pd_2048]
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1
+ psubd m8, m0, m4 ; t8a t10a
+ paddd m0, m4 ; t0a t2a
+ psubd m4, m1, m5 ; t9a t11a
+ paddd m1, m5 ; t1a t3a
+ psubd m5, m2, m6 ; t12a t14a
+ paddd m2, m6 ; t4a t6a
+ psubd m6, m3, m7 ; t13a t15a
+ paddd m3, m7 ; t5a t7a
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8
+ ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1
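+ ; coef2 = 10 reuses m10, which still holds the pd_799_3406 broadcast from
+ ; the ITX_MULSUB_2D above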
+ ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1
+ psubd m7, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ psubd m3, m4, m6 ; t12a t14a
+ paddd m4, m6 ; t8a t10a
+ psubd m6, m8, m5 ; t13a t15a
+ paddd m8, m5 ; t9a t11a
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8
+ punpcklqdq m5, m3, m7 ; t12a t4
+ punpckhqdq m3, m7 ; t14a t6
+ punpckhqdq m7, m6, m2 ; t15a t7
+ punpcklqdq m6, m2 ; t13a t5
+ ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567
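+ ; coef2 = 10 reuses m10, which still holds the pd_3784 broadcast from the
+ ; line above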
+ ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10
+ vpbroadcastd m10, [pd_2896]
+ vbroadcasti128 m9, [pw_2048_m2048] ; + + - -
+ punpckhqdq m2, m4, m0 ; t10a t2
+ punpcklqdq m4, m0 ; t8a t0
+ punpckhqdq m0, m8, m1 ; t11a t3
+ punpcklqdq m8, m1 ; t9a t1
+ paddd m1, m6, m7 ; out2 -out3
+ psubd m6, m7 ; t14a t6
+ paddd m7, m5, m3 ; -out13 out12
+ psubd m5, m3 ; t15a t7
+ psubd m3, m8, m0 ; t11 t3a
+ paddd m8, m0 ; out14 -out15
+ paddd m0, m4, m2 ; -out1 out0
+ psubd m4, m2 ; t10 t2a
+ REPX {pmaxsd x, m12}, m6, m5, m3, m4
+ REPX {pminsd x, m13}, m6, m5, m3, m4
+ REPX {pmulld x, m10}, m6, m5, m3, m4
+ paddd m6, m11
+ paddd m4, m11
+ paddd m2, m6, m5 ; -out5 out4
+ psubd m6, m5 ; out10 -out11
+ psubd m5, m4, m3 ; -out9 out8
+ paddd m3, m4 ; out6 -out7
+ REPX {psrad x, 12}, m2, m3, m5, m6
+ REPX {psignd x, m9}, m1, m8, m3, m6
+ pshufd m9, m9, q1032
+ REPX {psignd x, m9}, m0, m7, m2, m5
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+.pass1:
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_6144]
+ call m(iadst_16x4_internal_10bpc).main_end
+ psrad m0, m3, 13
+ psrad m1, m2, 13
+ psrad m2, m5, 13
+ psrad m3, m4, 13
+ psrad m4, m7, 13
+ psrad m5, m6, 13
+ psrad m6, m9, 13
+ psrad m7, m8, 13
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_10bpc).pass2_main
+ vpbroadcastd m5, [pw_2048]
+ vpbroadcastd m8, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2
+ pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
+ vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14
+ pxor m7, m7
+ psubw m9, m7, m5
+ vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
+ pmulhrsw m0, m4, m9
+ call .write_4x4
+ pmulhrsw m0, m2, m9
+ call .write_4x4
+ pmulhrsw m0, m1, m9
+ call .write_4x4
+ pmulhrsw m0, m3, m9
+ call .write_4x4
+ RET
+ALIGN function_align
+.write_4x4:
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+r6 ]
+ vpbroadcastq m5, [dstq+strideq*1]
+ vpbroadcastq m6, [dstq+strideq*2]
+ mova [cq+32*0], m7
+ mova [cq+32*1], m7
+ add cq, 32*2
+ vpblendd m4, m5, 0x30
+ vpblendd m4, m6, 0xc0
+ paddw m4, m0
+ pmaxsw m4, m7
+ pminsw m4, m8
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+ movhps [dstq+strideq*2], xm5
+ movhps [dstq+r6 ], xm4
+ lea dstq, [dstq+strideq*4]
+ ret
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+ vpbroadcastd m7, [pd_5793]
+ pmulld m0, m7, [cq+32*0]
+ pmulld m4, m7, [cq+32*1]
+ pmulld m1, m7, [cq+32*2]
+ pmulld m5, m7, [cq+32*3]
+ pmulld m2, m7, [cq+32*4]
+ pmulld m6, m7, [cq+32*5]
+ pmulld m3, m7, [cq+32*6]
+ pmulld m7, [cq+32*7]
+ vpbroadcastd m8, [pd_6144]
+ REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
+ REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7
+ jmp tx2q
+.pass2:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m7, [pw_1697x16]
+ vpbroadcastd m8, [pw_2048]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ vpbroadcastd m4, [pixel_10bpc_max]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
+ punpckhwd m7, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ lea r6, [strideq*5]
+ pxor m3, m3
+ punpckhdq m5, m0, m2 ; 2 3 6 7
+ punpckldq m0, m2 ; 0 1 4 5
+ punpckldq m6, m7, m1 ; 8 9 c d
+ punpckhdq m7, m1 ; a b e f
+ pmulhrsw m0, m8
+ call .write_2x4x2
+ pmulhrsw m0, m5, m8
+ call .write_2x4x2
+ pmulhrsw m0, m6, m8
+ lea dstq, [dstq+strideq*4]
+ call .write_2x4x2
+ pmulhrsw m0, m7, m8
+ call .write_2x4x2
+ ret
+ALIGN function_align
+.write_2x4x2:
+ movq xm1, [dstq+strideq*0]
+ movhps xm1, [dstq+strideq*1]
+ vpbroadcastq m2, [dstq+strideq*4]
+ vpblendd m1, m2, 0x30
+ vpbroadcastq m2, [dstq+r6 ]
+ vpblendd m1, m2, 0xc0
+ mova [cq+32*0], m3
+ mova [cq+32*1], m3
+ add cq, 32*2
+ paddw m1, m0
+ pmaxsw m1, m3
+ pminsw m1, m4
+ vextracti128 xm2, m1, 1
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ movq [dstq+strideq*4], xm2
+ movhps [dstq+r6 ], xm2
+ lea dstq, [dstq+strideq*2]
+ ret
+
+INV_TXFM_4X16_FN dct, dct, 12
+INV_TXFM_4X16_FN dct, identity, 12
+INV_TXFM_4X16_FN dct, adst, 12
+INV_TXFM_4X16_FN dct, flipadst, 12
+
+cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ jmp m(idct_4x16_internal_10bpc).pass1
+.pass2:
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+ punpckldq m1, m4, m5
+ punpckhdq m4, m5
+ punpckldq m3, m6, m7
+ punpckhdq m6, m7
+ punpcklqdq m5, m0, m2 ; 2 6
+ punpckhqdq m12, m0, m2 ; 3 7
+ punpcklqdq m0, m8, m9 ; 0 4
+ punpckhqdq m10, m8, m9 ; 1 5
+ punpcklqdq m2, m1, m3 ; 8 12
+ punpckhqdq m13, m1, m3 ; 9 13
+ punpcklqdq m9, m4, m6 ; 10 14
+ punpckhqdq m4, m6 ; 11 15
+ vperm2i128 m1, m5, m9, 0x20 ; 2 10
+ vperm2i128 m3, m9, m5, 0x31 ; 14 6
+ vpermq m11, m4, q1302 ; 15 11
+ ; interleave
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13
+ REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13
+ call m(idct_16x4_internal_10bpc).pass1_main
+ vpermq m6, m12, q1302 ; 7 3
+ vpermq m5, m13, q3120 ; 9 13
+ call m(idct_16x4_internal_10bpc).pass1_main2
+ call m(idct_16x4_internal_10bpc).pass1_main3
+ REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ mova m4, [idct16_12_shuf]
+ REPX {vpermd x, m4, x}, m0, m1, m2, m3
+ vpbroadcastd m9, [pw_16384]
+ vpbroadcastd m8, [pixel_12bpc_max]
+ call m(idct_4x16_internal_10bpc).pass2_end
+ RET
+
+INV_TXFM_4X16_FN adst, dct, 12
+INV_TXFM_4X16_FN adst, adst, 12
+INV_TXFM_4X16_FN adst, flipadst, 12
+INV_TXFM_4X16_FN adst, identity, 12
+
+cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ call .main_pass1
+ psrad m0, m4, 12
+ psrad m1, m5, 12
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ psrad m6, 12
+ psrad m7, 12
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .transpose_16x4
+ call m(iadst_4x16_internal_10bpc).main2
+ pshufd m4, m5, q1032
+ psrad m5, m6, 3
+ pshufd m6, m7, q1032
+ psrad m7, m8, 3
+ REPX {pshufd x, x, q1032}, m0, m2
+ REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6
+.pass2_end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ mova m4, [iadst16_12_shuf]
+ REPX {vpermd x, m4, x}, m0, m1, m2, m3
+ vpbroadcastd m9, [pw_16384]
+ vpbroadcastd m8, [pixel_12bpc_max]
+ lea r6, [strideq*3]
+ pxor m7, m7
+ pmulhrsw m0, m9
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ pmulhrsw m0, m9, m1
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ pmulhrsw m0, m9, m2
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ pmulhrsw m0, m9, m3
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ RET
+ALIGN function_align
+.transpose_16x4:
+ ; transpose & interleave
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+ punpckldq m1, m4, m5
+ punpckhdq m4, m5
+ punpckldq m3, m6, m7
+ punpckhdq m6, m7
+ punpcklqdq m10, m8, m0
+ punpckhqdq m0, m8
+ punpcklqdq m11, m9, m2
+ punpckhqdq m2, m9
+ punpcklqdq m8, m1, m4
+ punpckhqdq m4, m1
+ punpcklqdq m9, m3, m6
+ punpckhqdq m6, m3
+ vperm2i128 m5, m0, m2, 0x31 ; 7 5
+ vperm2i128 m7, m0, m2, 0x20 ; 3 1
+ vperm2i128 m0, m10, m11, 0x20 ; 0 2
+ vperm2i128 m2, m10, m11, 0x31 ; 4 6
+ vperm2i128 m1, m4, m6, 0x31 ; 15 13
+ vperm2i128 m3, m4, m6, 0x20 ; 11 9
+ vperm2i128 m4, m8, m9, 0x20 ; 8 10
+ vperm2i128 m6, m8, m9, 0x31 ; 12 14
+ ret
+ALIGN function_align
+.main_pass1:
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_3072]
+ paddd m10, m4, m5
+ psubd m4, m3
+ psubd m5, m3
+ paddd m3, m10
+ psubd m8, m7, m1
+ paddd m7, m9
+ psubd m9, m1
+ paddd m7, m1
+ REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
+ REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
+ paddd m6, m0
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct, 12
+INV_TXFM_4X16_FN flipadst, adst, 12
+INV_TXFM_4X16_FN flipadst, flipadst, 12
+INV_TXFM_4X16_FN flipadst, identity, 12
+
+cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ call m(iadst_4x16_internal_12bpc).main_pass1
+ psrad m0, m3, 12
+ psrad m1, m2, 12
+ psrad m2, m5, 12
+ psrad m3, m4, 12
+ psrad m4, m7, 12
+ psrad m5, m6, 12
+ psrad m6, m9, 12
+ psrad m7, m8, 12
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_4x16_internal_12bpc).transpose_16x4
+ call m(iadst_4x16_internal_10bpc).main2
+ pshufd m4, m3, q1032
+ psrad m3, m5, 3
+ psrad m5, m2, 3
+ pshufd m2, m6, q1032
+ pshufd m6, m1, q1032
+ psrad m1, m7, 3
+ psrad m7, m0, 3
+ pshufd m0, m8, q1032
+ REPX {psrad x, 3}, m0, m2, m4, m6
+ jmp m(iadst_4x16_internal_12bpc).pass2_end
+
+INV_TXFM_4X16_FN identity, dct, 12
+INV_TXFM_4X16_FN identity, adst, 12
+INV_TXFM_4X16_FN identity, flipadst, 12
+INV_TXFM_4X16_FN identity, identity, 12
+
+cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [pd_1697]
+ mova m0, [cq+32*0]
+ mova m4, [cq+32*1]
+ mova m1, [cq+32*2]
+ mova m5, [cq+32*3]
+ vpbroadcastd m9, [pd_6144]
+ pmulld m2, m8, m0
+ pmulld m6, m8, m4
+ pmulld m3, m8, m1
+ pmulld m7, m8, m5
+ mova m10, [cq+32*4]
+ mova m11, [cq+32*5]
+ mova m12, [cq+32*6]
+ mova m13, [cq+32*7]
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m0, m2
+ pmulld m2, m8, m10
+ paddd m4, m6
+ pmulld m6, m8, m11
+ paddd m1, m3
+ pmulld m3, m8, m12
+ paddd m5, m7
+ pmulld m7, m8, m13
+ REPX {psrad x, 1 }, m0, m4, m1, m5
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m2, m10
+ paddd m6, m11
+ paddd m3, m12
+ paddd m7, m13
+ REPX {psrad x, 1 }, m2, m6, m3, m7
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m8, [pd_5793]
+ vpbroadcastd m9, [pd_1024]
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m8, [pw_16384]
+ vpbroadcastd m4, [pixel_12bpc_max]
+ call m(iidentity_4x16_internal_10bpc).pass2_end
+ RET
+
+%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 8x4, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+%else
+ jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, identity
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+
+cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+.pass1:
+ vbroadcasti128 m1, [cq+16*1]
+ vbroadcasti128 m0, [cq+16*5]
+ vbroadcasti128 m2, [cq+16*3]
+ vbroadcasti128 m3, [cq+16*7]
+ vpbroadcastd m6, [pd_2896]
+ shufpd m1, m0, 0x0c ; 1 5
+ shufpd m3, m2, 0x0c ; 7 3
+ vbroadcasti128 m0, [cq+16*0]
+ vbroadcasti128 m4, [cq+16*2]
+ vbroadcasti128 m2, [cq+16*4]
+ vbroadcasti128 m5, [cq+16*6]
+ vpbroadcastd m7, [pd_2048]
+ shufpd m0, m4, 0x0c ; 0 2
+ shufpd m2, m5, 0x0c ; 4 6
+ REPX {pmulld x, m6}, m1, m3, m0, m2
+ REPX {paddd x, m7}, m1, m3, m0, m2
+ REPX {psrad x, 12}, m1, m3, m0, m2
+ call .main
+ psubd m3, m0, m4 ; out7 out6 (interleaved)
+ paddd m0, m4 ; out0 out1 (interleaved)
+ paddd m1, m2, m5 ; out3 out2 (interleaved)
+ psubd m2, m5 ; out4 out5 (interleaved)
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ jmp tx2q
+.pass2:
+ vbroadcasti128 m4, [deint_shuf]
+ packssdw m0, m1
+ packssdw m2, m3
+ vperm2i128 m1, m0, m2, 0x31
+ vinserti128 m0, xm2, 1
+ pshufb m0, m4
+ pshufb m1, m4
+ IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7
+ vpermq m0, m0, q3120 ; out0 out1
+ vpermq m2, m1, q2031 ; out2 out3
+ jmp m(iadst_8x4_internal_10bpc).end
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1
+ IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7
+ vpbroadcastd m6, [pd_2896]
+ punpcklqdq m4, m1, m3 ; t4a t7a
+ punpckhqdq m1, m3 ; t5a t6a
+ psubd m3, m4, m1 ; t5a t6a
+ paddd m4, m1 ; t4 t7
+ REPX {pmaxsd x, m8}, m3, m4, m0, m2
+ REPX {pminsd x, m9}, m3, m4, m0, m2
+ pmulld m3, m6
+ pshufd m1, m3, q1032
+ paddd m3, m7
+ psubd m5, m3, m1
+ paddd m1, m3
+ psrad m5, 12
+ psrad m1, 12
+ vpblendd m5, m4, 0x33 ; t4 t5
+ punpckhqdq m4, m1 ; t7 t6
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_10bpc).main
+ vpblendd m3, m0, m4, 0x33 ; out6 out7
+ vpblendd m0, m4, 0xcc ; out0 out1
+ pshufd m1, m5, q1032
+ psignd m2, m6 ; out4 out5
+ psignd m1, m6 ; out2 out3
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ vpermq m0, m0, q3120 ; out0 out1
+ vpermq m2, m1, q3120 ; out2 out3
+.end:
+ vpbroadcastd m1, [pw_2048]
+ pmulhrsw m0, m1
+ pmulhrsw m1, m2
+ vpbroadcastd m5, [pixel_10bpc_max]
+.end2:
+ mova xm2, [dstq+strideq*0]
+ vinserti128 m2, [dstq+strideq*1], 1
+ lea r6, [dstq+strideq*2]
+ mova xm3, [r6 +strideq*0]
+ vinserti128 m3, [r6 +strideq*1], 1
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [r6 +strideq*0], xm1
+ vextracti128 [r6 +strideq*1], m1, 1
+ RET
+ALIGN function_align
+.pass2_main:
+ vbroadcasti128 m4, [deint_shuf]
+ packssdw m0, m1
+ packssdw m2, m3
+ lea r6, [deint_shuf+128]
+ vperm2i128 m1, m0, m2, 0x31
+ vinserti128 m0, xm2, 1
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp m(iadst_8x4_internal_8bpc).main
+ALIGN function_align
+.main:
+ vpbroadcastd m1, [pd_2896]
+ pmulld m0, m1, [cq+32*0]
+ pmulld m3, m1, [cq+32*3]
+ pmulld m2, m1, [cq+32*2]
+ pmulld m1, [cq+32*1]
+ vpbroadcastd m4, [pd_2048]
+ REPX {paddd x, m4}, m0, m3, m2, m1
+ REPX {psrad x, 12}, m0, m3, m2, m1
+.main2:
+ IADST4_1D
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_10bpc).main
+ shufpd m3, m4, m0, 0x05
+ shufpd m0, m4, 0x05
+ psignd m2, m6
+ pshufd m6, m6, q1032
+ pshufd m1, m2, q1032
+ psignd m2, m5, m6
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_10bpc).pass2_main
+ vpermq m2, m0, q2031
+ vpermq m0, m1, q2031
+ jmp m(iadst_8x4_internal_10bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m4, [pd_2896]
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpermq m2, [cq+32*2], q3120
+ vpermq m3, [cq+32*3], q3120
+ vpbroadcastd m7, [pd_2048]
+ REPX {pmulld x, m4}, m0, m1, m2, m3
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {paddd x, x }, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m5, [pixel_10bpc_max]
+ vpbroadcastd m4, [pw_1697x8]
+ packssdw m0, m1
+ packssdw m2, m3
+ pmulhrsw m1, m4, m0
+ pmulhrsw m4, m2
+ paddsw m0, m1
+ paddsw m2, m4
+ packssdw m7, m7 ; pw_2048
+.pass2_end:
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ lea r6, [dstq+strideq*2]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m7
+ pmulhrsw m0, m7
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova xm2, [dstq+strideq*0]
+ vinserti128 m2, [r6 +strideq*0], 1
+ mova xm3, [dstq+strideq*1]
+ vinserti128 m3, [r6 +strideq*1], 1
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [r6 +strideq*0], m0, 1
+ vextracti128 [r6 +strideq*1], m1, 1
+ RET
+
+INV_TXFM_8X4_FN dct, dct, 12
+INV_TXFM_8X4_FN dct, identity, 12
+INV_TXFM_8X4_FN dct, adst, 12
+INV_TXFM_8X4_FN dct, flipadst, 12
+
+cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ jmp m(idct_8x4_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call m(iadst_8x4_internal_12bpc).transpose_4x8
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(iadst_8x4_internal_12bpc).end
+
+INV_TXFM_8X4_FN adst, dct, 12
+INV_TXFM_8X4_FN adst, adst, 12
+INV_TXFM_8X4_FN adst, flipadst, 12
+INV_TXFM_8X4_FN adst, identity, 12
+
+cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ call m(iadst_4x8_internal_10bpc).main2
+ vpblendd m3, m0, m4, 0x33 ; out6 out7
+ vpblendd m0, m4, 0xcc ; out0 out1
+ pshufd m1, m5, q1032
+ psignd m2, m6 ; out4 out5
+ psignd m1, m6 ; out2 out3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call .pass2_main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass2_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
+.end:
+ vpbroadcastd m4, [pw_16384]
+ REPX {psrad x, 3}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m2, m4
+ vpermq m0, m0, q3120 ; out0 out1
+ vpermq m1, m1, q3120 ; out2 out3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ jmp m(iadst_8x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+ call .transpose_4x8
+ jmp m(iadst_8x4_internal_10bpc).main2
+ALIGN function_align
+.transpose_4x8:
+ ; deinterleave
+ pshufd m0, m0, q3120
+ pshufd m1, m1, q3120
+ pshufd m2, m2, q3120
+ pshufd m3, m3, q3120
+ ; transpose
+ punpcklqdq m4, m0, m1
+ punpckhqdq m0, m1
+ punpcklqdq m5, m2, m3
+ punpckhqdq m2, m3
+ vperm2i128 m1, m0, m2, 0x20 ; out1
+ vperm2i128 m3, m0, m2, 0x31 ; out3
+ vperm2i128 m2, m4, m5, 0x31 ; out2
+ vperm2i128 m0, m4, m5, 0x20 ; out0
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct, 12
+INV_TXFM_8X4_FN flipadst, adst, 12
+INV_TXFM_8X4_FN flipadst, flipadst, 12
+INV_TXFM_8X4_FN flipadst, identity, 12
+
+cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ call m(iadst_4x8_internal_10bpc).main2
+ shufpd m3, m4, m0, 0x05
+ shufpd m0, m4, 0x05
+ psignd m2, m6
+ pshufd m6, m6, q1032
+ pshufd m1, m2, q1032
+ psignd m2, m5, m6
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call m(iadst_8x4_internal_12bpc).pass2_main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m3, m5, m4
+ paddd m2, m5, m6
+ jmp m(iadst_8x4_internal_12bpc).pass2_end
+
+INV_TXFM_8X4_FN identity, dct, 12
+INV_TXFM_8X4_FN identity, adst, 12
+INV_TXFM_8X4_FN identity, flipadst, 12
+INV_TXFM_8X4_FN identity, identity, 12
+
+cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x4_internal_10bpc).pass1
+.pass2:
+ ; m0 = in0 in1 (interleaved)
+ ; m1 = in2 in3 (interleaved)
+ ; m2 = in4 in5 (interleaved)
+ ; m3 = in6 in7 (interleaved)
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ vpbroadcastd m4, [pd_5793]
+ REPX {pmulld x, m4}, m0, m1, m2, m3
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 15}, m0, m1, m2, m3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ vpbroadcastd m7, [pw_16384]
+ packssdw m0, m1
+ packssdw m2, m3
+ jmp m(iidentity_8x4_internal_10bpc).pass2_end
+
+%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 8x8, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly2:
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm2
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ paddsw m1, m0
+ psubusw m1, m2
+ mova [dstq+strideq*0], xm1
+ vextracti128 [dstq+strideq*1], m1, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%else
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endif
+%endmacro
+
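+; 8-point inverse ADST: out0/-out1/out6/-out7 are produced directly, the
+; remaining four outputs are returned scaled by 1448 (2896/2) for the caller
+; to round and shift (see the .main_end helpers below)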
+%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2]
+ ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a
+ psubd m%9, m%3, m%7 ; t6
+ paddd m%3, m%7 ; t2
+ psubd m%7, m%1, m%5 ; t4
+ paddd m%1, m%5 ; t0
+ psubd m%5, m%6, m%2 ; t7
+ paddd m%6, m%2 ; t3
+ psubd m%2, m%8, m%4 ; t5
+ paddd m%8, m%4 ; t1
+ REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
+ REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
+ ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a
+ psubd m%10, m%7, m%9 ; t7
+ paddd m%7, m%9 ; out6
+ vpbroadcastd m%9, [pd_1448]
+ psubd m%4, m%8, m%6 ; t3
+ paddd m%8, m%6 ; -out7
+ psubd m%6, m%1, m%3 ; t2
+ paddd m%1, m%3 ; out0
+ psubd m%3, m%2, m%5 ; t6
+ paddd m%2, m%5 ; -out1
+ REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
+ REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10
+ REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10
+ psubd m%5, m%6, m%4 ; (t2 - t3) * 1448
+ paddd m%4, m%6 ; (t2 + t3) * 1448
+ psubd m%6, m%3, m%10 ; (t6 - t7) * 1448
+ paddd m%3, m%10 ; (t6 + t7) * 1448
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ mova m5, [cq+32*5]
+ mova m6, [cq+32*6]
+ mova m7, [cq+32*7]
+ vpbroadcastd m11, [pd_2048]
+ call .main
+ call .round_shift1
+ jmp tx2q
+.pass2:
+ call .transpose_8x8_packed
+ call m(idct_8x8_internal_8bpc).main
+ vpbroadcastd m12, [pw_2048]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call .write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call .write_8x4
+ RET
+ALIGN function_align
+.write_8x4_start:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+.write_8x4:
+ mova xm8, [dstq+strideq*0]
+ vinserti128 m8, [dstq+strideq*1], 1
+ mova xm9, [dstq+strideq*2]
+ vinserti128 m9, [dstq+r6 ], 1
+ mova [cq+32*0], m10
+ mova [cq+32*1], m10
+ mova [cq+32*2], m10
+ mova [cq+32*3], m10
+ add cq, 32*4
+ paddw m0, m8
+ paddw m1, m9
+ pmaxsw m0, m10
+ pmaxsw m1, m10
+ pminsw m0, m11
+ pminsw m1, m11
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.transpose_8x8_packed:
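+ ; pack the eight 32-bit rows to 16-bit and transpose, leaving two rows per
+ ; ymm register for the 8bpc second-pass routines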
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ lea r6, [deint_shuf+128]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m4, m1
+ punpckldq m4, m1
+ vinserti128 m1, m3, xm2, 1
+ vperm2i128 m3, m2, 0x31
+ vperm2i128 m2, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ ret
+ALIGN function_align
+.main_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main:
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m9, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ vpbroadcastd m3, [pd_2896]
+ REPX {pmaxsd x, m12}, m1, m8, m7, m9
+ REPX {pminsd x, m13}, m1, m8, m7, m9
+ REPX {pmulld x, m3 }, m0, m4, m7, m1
+ paddd m0, m11
+ paddd m7, m11
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+ REPX {pmaxsd x, m12}, m0, m6, m5, m3
+ REPX {pminsd x, m13}, m0, m6, m5, m3
+ ret
+ALIGN function_align
+.round_shift1:
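+ ; pcmpeqd produces -1; subtracting it adds the +1 rounding bias, which
+ ; propagates through the final butterflies ahead of the >> 1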
+ pcmpeqd m1, m1
+ REPX {psubd x, m1}, m0, m6, m5, m3
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call .main
+ call .main_end
+ jmp tx2q
+.pass2:
+ call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [pw_2048]
+ vpbroadcastd xm12, [pw_4096]
+ psubw m12, m5
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+32*0]
+ mova m7, [cq+32*7]
+ mova m1, [cq+32*1]
+ mova m6, [cq+32*6]
+ mova m2, [cq+32*2]
+ mova m5, [cq+32*5]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ vpbroadcastd m11, [pd_2048]
+.main2:
+ IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+ psrld m8, 10 ; pd_1
+ vpbroadcastd m9, [pd_3072]
+ ret
+ALIGN function_align
+.main_end:
+ paddd m0, m8
+ psubd m1, m8, m1
+ paddd m6, m8
+ psubd m7, m8, m7
+ REPX {psrad x, 1 }, m0, m1, m6, m7
+ ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
+ ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
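+ ; (both hold for any x under arithmetic shifts: with x + 1024 = 2048*q + r,
+ ;  0 <= r < 2048, each side reduces to (q + 1) >> 1 resp. (1 - q) >> 1)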
+ psubd m8, m9, m8 ; pd_3071
+ paddd m2, m9
+ psubd m3, m8, m3
+ paddd m4, m9
+ psubd m5, m8, m5
+ REPX {psrad x, 12}, m2, m3, m4, m5
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call m(iadst_8x8_internal_10bpc).main
+ call .main_end
+ jmp tx2q
+.pass2:
+ call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m12, [pw_2048]
+ vpbroadcastd xm5, [pw_4096]
+ psubw m12, m5
+ vpermq m8, m3, q2031
+ vpermq m9, m2, q2031
+ vpermq m2, m1, q2031
+ vpermq m3, m0, q2031
+ pmulhrsw m0, m8, m12
+ pmulhrsw m1, m9, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.main_end:
+ paddd m10, m8, m0
+ psubd m0, m8, m7
+ psubd m7, m8, m1
+ paddd m1, m8, m6
+ psrad m0, 1
+ psrad m1, 1
+ psrad m6, m7, 1
+ psrad m7, m10, 1
+ psubd m8, m9, m8 ; pd_6143
+ psubd m10, m8, m5
+ paddd m5, m9, m2
+ psubd m2, m8, m3
+ paddd m3, m9, m4
+ psrad m4, m2, 12
+ psrad m2, m10, 12
+ psrad m3, 12
+ psrad m5, 12
+ ret
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+.pass1:
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ mova m5, [cq+32*5]
+ mova m6, [cq+32*6]
+ mova m7, [cq+32*7]
+ jmp tx2q
+.pass2:
+ packssdw m3, m7
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass2_main:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ vpbroadcastd m12, [pw_4096]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m1
+ punpckhdq m4, m1
+ punpckhqdq m1, m0, m2 ; 1 5
+ punpcklqdq m0, m2 ; 0 4
+ punpcklqdq m2, m3, m4 ; 2 6
+ punpckhqdq m3, m4 ; 3 7
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call .write_2x8x2_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call .write_2x8x2_zero
+ RET
+.write_2x8x2_start:
+ lea r6, [strideq*5]
+ pxor m6, m6
+.write_2x8x2_zero:
+ mova [cq+32*0], m6
+ mova [cq+32*1], m6
+ mova [cq+32*2], m6
+ mova [cq+32*3], m6
+ add cq, 32*4
+.write_2x8x2:
+ mova xm4, [dstq+strideq*0]
+ vinserti128 m4, [dstq+strideq*4], 1
+ mova xm5, [dstq+strideq*1]
+ vinserti128 m5, [dstq+r6 ], 1
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ pminsw m0, m7
+ pminsw m1, m7
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*4], m0, 1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+strideq*2]
+ ret
+
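+; full 8x8 transpose of packed dwords across eight ymm registers; the letter
+; groups in the comments track element positions through the unpack steps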
+%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
+ punpckldq m%9, m%1, m%2 ; aibj emfn
+ punpckhdq m%1, m%2 ; ckdl gohp
+ punpckldq m%10, m%3, m%4 ; qyrz uCvD
+ punpckhdq m%3, m%4 ; sAtB wExF
+ punpckldq m%11, m%5, m%6 ; GOHP KSLT
+ punpckhdq m%5, m%6 ; IQJR MUNV
+ punpckldq m%12, m%7, m%8 ; WeXf aibj
+ punpckhdq m%7, m%8 ; YgZh ckdl
+ punpcklqdq m%2, m%9, m%10 ; aiqy emuC
+ punpckhqdq m%9, m%10 ; bjrz fnvD
+ punpcklqdq m%4, m%1, m%3 ; cksA gowE
+ punpckhqdq m%10, m%1, m%3 ; dltB hpxF
+ punpcklqdq m%6, m%11, m%12 ; GOWe KSai
+ punpckhqdq m%11, m%12 ; HPXf LTbj
+ punpcklqdq m%8, m%5, m%7 ; IQYg MUck
+ punpckhqdq m%12, m%5, m%7 ; JRZh NVdl
+ vperm2i128 m%1, m%2, m%6, 0x20 ; out0
+ vperm2i128 m%5, m%2, m%6, 0x31 ; out4
+ vperm2i128 m%2, m%9, m%11, 0x20 ; out1
+ vperm2i128 m%6, m%9, m%11, 0x31 ; out5
+ vperm2i128 m%3, m%4, m%8, 0x20 ; out2
+ vperm2i128 m%7, m%4, m%8, 0x31 ; out6
+ vperm2i128 m%4, m%10, m%12, 0x20 ; out3
+ vperm2i128 m%8, m%10, m%12, 0x31 ; out7
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct, 12
+INV_TXFM_8X8_FN dct, identity, 12
+INV_TXFM_8X8_FN dct, adst, 12
+INV_TXFM_8X8_FN dct, flipadst, 12
+
+cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_8x8_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .transpose_8x8
+ vpbroadcastd m11, [pd_2048]
+ call m(idct_8x8_internal_10bpc).main
+ call .round_shift4
+ jmp m(iadst_8x8_internal_12bpc).pass2_end
+ALIGN function_align
+.write_8x4_start:
+ vpbroadcastd m11, [pixel_12bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+ ret
+ALIGN function_align
+.transpose_8x8:
+ TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ ret
+ALIGN function_align
+.round_shift4:
+ vpbroadcastd m1, [pd_8]
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct, 12
+INV_TXFM_8X8_FN adst, adst, 12
+INV_TXFM_8X8_FN adst, flipadst, 12
+INV_TXFM_8X8_FN adst, identity, 12
+
+cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_8x8_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+.pass2_end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ REPX {vpermq x, x, q3120}, m0, m1
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ packssdw m0, m4, m5
+ packssdw m1, m6, m7
+ REPX {vpermq x, x, q3120}, m0, m1
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.pass2_main:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ vpbroadcastd m11, [pd_2048]
+.pass2_main2:
+ call m(iadst_8x8_internal_10bpc).main2
+ pslld m9, m8, 3 ; pd_8
+ paddd m0, m9
+ psubd m1, m9, m1 ; 8+x
+ paddd m6, m9
+ psubd m7, m9, m7
+ REPX {psrad x, 4}, m0, m1, m6, m7
+ vpbroadcastd m9, [pd_17408]
+ psubd m8, m9, m8 ; 17407
+ paddd m2, m9
+ psubd m3, m8, m3
+ paddd m4, m9
+ psubd m5, m8, m5
+ REPX {psrad x, 15}, m2, m3, m4, m5
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct, 12
+INV_TXFM_8X8_FN flipadst, adst, 12
+INV_TXFM_8X8_FN flipadst, flipadst, 12
+INV_TXFM_8X8_FN flipadst, identity, 12
+
+cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_8x8_internal_10bpc).pass1
+.pass2:
+ call m(iadst_8x8_internal_12bpc).pass2_main
+ packssdw m7, m7, m6
+ packssdw m6, m1, m0
+ packssdw m1, m5, m4
+ vpermq m0, m7, q3120
+ vpermq m1, m1, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ packssdw m0, m3, m2
+ vpermq m0, m0, q3120
+ vpermq m1, m6, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+
+INV_TXFM_8X8_FN identity, dct, 12
+INV_TXFM_8X8_FN identity, adst, 12
+INV_TXFM_8X8_FN identity, flipadst, 12
+INV_TXFM_8X8_FN identity, identity, 12
+
+cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x8_internal_10bpc).pass1
+.pass2:
+ packssdw m3, m7
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(iidentity_8x8_internal_10bpc).pass2_main
+
+%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
+ INV_TXFM_FN %1, %2, %3, 8x16, %4
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_%4bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ cmp eobd, 43
+ jl .fast
+ add cq, 32
+ call .pass1_main
+ sub cq, 32
+ mova [cq+32* 1], m0
+ mova [cq+32* 3], m1
+ mova [cq+32* 5], m2
+ mova [cq+32* 7], m3
+ mova [cq+32* 9], m4
+ mova [cq+32*11], m5
+ mova [cq+32*13], m6
+ mova m15, m7
+ call .pass1_main
+ mova m8, [cq+32* 1]
+ mova m9, [cq+32* 3]
+ mova m10, [cq+32* 5]
+ mova m11, [cq+32* 7]
+ mova m12, [cq+32* 9]
+ mova m13, [cq+32*11]
+ mova m14, [cq+32*13]
+ jmp tx2q
+.fast:
+ call .pass1_main
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ call m(idct_8x16_internal_8bpc).main
+ vpbroadcastd m12, [pw_2048]
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+ REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m0, m4, m12
+ pmulhrsw m1, m5, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m0, m6, m12
+ pmulhrsw m1, m7, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.transpose:
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m7, m15
+ lea r6, [deint_shuf+128]
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpcklwd m3, m4, m5
+ punpckhwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ punpckhdq m7, m3, m6
+ punpckldq m3, m6
+ punpckhdq m6, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ vperm2i128 m2, m0, m3, 0x31
+ vinserti128 m0, xm3, 1
+ vperm2i128 m3, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m7, m5, m6, 0x31
+ vinserti128 m5, xm6, 1
+ vperm2i128 m6, m8, m4, 0x31
+ vinserti128 m4, m8, xm4, 1
+ ret
+ALIGN function_align
+.pass1_main:
+ pmulld m0, m14, [cq+32* 0]
+ pmulld m1, m14, [cq+32* 2]
+ pmulld m2, m14, [cq+32* 4]
+ pmulld m3, m14, [cq+32* 6]
+ pmulld m4, m14, [cq+32* 8]
+ pmulld m5, m14, [cq+32*10]
+ pmulld m6, m14, [cq+32*12]
+ pmulld m7, m14, [cq+32*14]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ jmp m(idct_8x8_internal_10bpc).round_shift1
+ALIGN function_align
+.main_evenhalf:
+ paddd m1, m6, m7 ; idct8 out1
+ psubd m6, m7 ; idct8 out6
+ psubd m7, m0, m9 ; idct8 out7
+ paddd m0, m9 ; idct8 out0
+ paddd m2, m5, m4 ; idct8 out2
+ psubd m5, m4 ; idct8 out5
+ psubd m4, m3, m8 ; idct8 out4
+ paddd m3, m8 ; idct8 out3
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+.main_oddhalf_fast_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_fast: ; lower half zero
+ vpbroadcastd m7, [pd_4076]
+ vpbroadcastd m8, [pd_401]
+ vpbroadcastd m6, [pd_m1189]
+ vpbroadcastd m9, [pd_3920]
+ vpbroadcastd m5, [pd_3612]
+ vpbroadcastd m10, [pd_1931]
+ vpbroadcastd m4, [pd_m2598]
+ vpbroadcastd m15, [pd_3166]
+ pmulld m7, m0
+ pmulld m0, m8
+ pmulld m6, m1
+ pmulld m1, m9
+ pmulld m5, m2
+ pmulld m2, m10
+ pmulld m4, m3
+ pmulld m3, m15
+ jmp .main_oddhalf_fast2
+.main_oddhalf_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf:
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+ REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+ REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+ psubd m8, m0, m4 ; t9
+ paddd m0, m4 ; t8
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+ vpbroadcastd m15, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2
+ psubd m3, m1, m4 ; t10
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ psubd m2, m8, m6 ; t13
+ paddd m6, m8 ; t14
+ psubd m8, m7, m5 ; t12a
+ paddd m7, m5 ; t15a
+ REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pmulld x, m14}, m2, m8, m3, m4
+ paddd m2, m11
+ paddd m8, m11
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m8, m4 ; t11
+ paddd m4, m8 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova [r6-32*4], m7
+ mova [r6-32*3], m6
+ mova [r6-32*2], m5
+ mova [r6-32*1], m4
+ mova [r6+32*0], m3
+ mova [r6+32*1], m2
+ mova [r6+32*2], m1
+ mova [r6+32*3], m0
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity, 35
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ cmp eobd, 43
+ jl .fast
+ add cq, 32
+ call .pass1_main
+ call m(iadst_8x8_internal_10bpc).main_end
+ sub cq, 32
+ mova [cq+32* 1], m0
+ mova [cq+32* 3], m1
+ mova [cq+32* 5], m2
+ mova [cq+32* 7], m3
+ mova [cq+32* 9], m4
+ mova [cq+32*11], m5
+ mova [cq+32*13], m6
+ mova m15, m7
+ call .pass1_main
+ call m(iadst_8x8_internal_10bpc).main_end
+ mova m8, [cq+32* 1]
+ mova m9, [cq+32* 3]
+ mova m10, [cq+32* 5]
+ mova m11, [cq+32* 7]
+ mova m12, [cq+32* 9]
+ mova m13, [cq+32*11]
+ mova m14, [cq+32*13]
+ jmp tx2q
+.fast:
+ call .pass1_main
+ call m(iadst_8x8_internal_10bpc).main_end
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_8x16_internal_10bpc).transpose
+ call m(iadst_8x16_internal_8bpc).main
+ call m(iadst_8x16_internal_8bpc).main_pass2_end
+ vpbroadcastd m8, [pw_2048]
+ vpbroadcastd xm12, [pw_4096]
+ REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+ REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+ psubw m12, m8
+ jmp m(idct_8x16_internal_10bpc).end
+ALIGN function_align
+.pass1_main:
+ pmulld m0, m14, [cq+32* 0]
+ pmulld m7, m14, [cq+32*14]
+ pmulld m1, m14, [cq+32* 2]
+ pmulld m6, m14, [cq+32*12]
+ pmulld m2, m14, [cq+32* 4]
+ pmulld m5, m14, [cq+32*10]
+ pmulld m3, m14, [cq+32* 6]
+ pmulld m4, m14, [cq+32* 8]
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp m(iadst_8x8_internal_10bpc).main2
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity, 35
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ cmp eobd, 43
+ jl .fast
+ add cq, 32
+ call m(iadst_8x16_internal_10bpc).pass1_main
+ call m(iflipadst_8x8_internal_10bpc).main_end
+ sub cq, 32
+ mova [cq+32* 1], m0
+ mova [cq+32* 3], m1
+ mova [cq+32* 5], m2
+ mova [cq+32* 7], m3
+ mova [cq+32* 9], m4
+ mova [cq+32*11], m5
+ mova [cq+32*13], m6
+ mova m15, m7
+ call m(iadst_8x16_internal_10bpc).pass1_main
+ call m(iflipadst_8x8_internal_10bpc).main_end
+ mova m8, [cq+32* 1]
+ mova m9, [cq+32* 3]
+ mova m10, [cq+32* 5]
+ mova m11, [cq+32* 7]
+ mova m12, [cq+32* 9]
+ mova m13, [cq+32*11]
+ mova m14, [cq+32*13]
+ jmp tx2q
+.fast:
+ call m(iadst_8x16_internal_10bpc).pass1_main
+ call m(iflipadst_8x8_internal_10bpc).main_end
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_8x16_internal_10bpc).transpose
+ call m(iadst_8x16_internal_8bpc).main
+ call m(iadst_8x16_internal_8bpc).main_pass2_end
+ vpbroadcastd m12, [pw_2048]
+ vpbroadcastd xm13, [pw_4096]
+ mova m11, m0
+ vpermq m0, m7, q2031
+ mova m10, m1
+ vpermq m1, m6, q2031
+ mova m9, m2
+ vpermq m2, m5, q2031
+ mova m8, m3
+ vpermq m3, m4, q2031
+ vpermq m4, m8, q3120
+ vpermq m5, m9, q3120
+ vpermq m6, m10, q3120
+ vpermq m7, m11, q3120
+ psubw m12, m13
+ jmp m(idct_8x16_internal_10bpc).end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
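+; in-place 16-tap identity scaling: x*1697/2048 (via pw_1697x16) is added to
+; 2*x for a 2*sqrt(2) scale, or to x after an extra halving (pw_16384 or
+; psraw 1) for a sqrt(2) scale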
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+%ifnum %4
+ pmulhrsw m%2, m%4
+%else ; without rounding
+ psraw m%2, 1
+%endif
+%else
+ paddsw m%1, m%1
+%endif
+ paddsw m%1, m%2
+%endmacro
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ pmulld m0, m15, [cq+32* 0]
+ pmulld m8, m15, [cq+32* 1]
+ pmulld m1, m15, [cq+32* 2]
+ pmulld m9, m15, [cq+32* 3]
+ pmulld m2, m15, [cq+32* 4]
+ pmulld m10, m15, [cq+32* 5]
+ pmulld m3, m15, [cq+32* 6]
+ pmulld m11, m15, [cq+32* 7]
+ pmulld m4, m15, [cq+32* 8]
+ pmulld m12, m15, [cq+32* 9]
+ pmulld m5, m15, [cq+32*10]
+ pmulld m13, m15, [cq+32*11]
+ pmulld m6, m15, [cq+32*12]
+ pmulld m14, m15, [cq+32*13]
+ pmulld m7, m15, [cq+32*14]
+ pmulld m15, [cq+32*15]
+ mova [cq], m7
+ vpbroadcastd m7, [pd_2048]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [cq]
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m13, m7, m15
+ vpbroadcastd m8, [pw_1697x16]
+ REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13
+ vpbroadcastd m7, [pixel_10bpc_max]
+ vpbroadcastd m12, [pw_2048]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
+ punpckhwd m9, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m13
+ punpcklwd m6, m13
+ punpckhwd m13, m4, m5
+ punpcklwd m4, m5
+ punpcklwd m5, m2, m3
+ punpckhwd m2, m3
+ punpckhdq m3, m0, m5
+ punpckldq m0, m5
+ punpckhdq m11, m9, m2
+ punpckldq m9, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckldq m6, m13, m1
+ punpckhdq m13, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m8, m9, m6
+ punpckhqdq m9, m6
+ punpcklqdq m10, m11, m13
+ punpckhqdq m11, m13
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_start
+ pmulhrsw m0, m12, m2
+ pmulhrsw m1, m12, m3
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ pmulhrsw m0, m12, m8
+ pmulhrsw m1, m12, m9
+ lea dstq, [dstq+strideq*4]
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ pmulhrsw m0, m12, m10
+ pmulhrsw m1, m12, m11
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ ret
+
+INV_TXFM_8X16_FN dct, dct, 0, 12
+INV_TXFM_8X16_FN dct, identity, 35, 12
+INV_TXFM_8X16_FN dct, adst, 0, 12
+INV_TXFM_8X16_FN dct, flipadst, 0, 12
+
+cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call .transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ mova [cq+32* 8], m0
+ mova [cq+32*10], m2
+ mova [cq+32*12], m4
+ mova [cq+32*14], m6
+ pmaxsd m0, m12, [cq+32* 1]
+ pmaxsd m4, m12, m1
+ pmaxsd m1, m12, [cq+32* 3]
+ pmaxsd m2, m12, [cq+32* 5]
+ pmaxsd m6, m12, m5
+ pmaxsd m5, m12, m3
+ pmaxsd m3, m12, [cq+32* 7]
+ pmaxsd m7, m12
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+32* 0]
+ pmaxsd m1, m12, [cq+32* 2]
+ pmaxsd m2, m12, [cq+32* 4]
+ pmaxsd m3, m12, [cq+32* 6]
+ pmaxsd m4, m12, [cq+32* 8]
+ pmaxsd m5, m12, [cq+32*10]
+ pmaxsd m6, m12, [cq+32*12]
+ pmaxsd m7, m12, [cq+32*14]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ vpbroadcastd m11, [pd_8]
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x8_internal_10bpc).pass1_rotations
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+.end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m2, q3120
+ vpermq m1, m3, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m6, q3120
+ vpermq m1, m7, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.transpose:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ mova [cq+32* 3], m3
+ mova [cq+32* 4], m4
+ mova [cq+32* 5], m5
+ mova [cq+32* 6], m6
+ mova [cq+32* 7], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, m12
+ mova m5, m13
+ mova m6, m14
+ mova m7, m15
+ jmp m(idct_8x8_internal_12bpc).transpose_8x8
+
+INV_TXFM_8X16_FN adst, dct, 0, 12
+INV_TXFM_8X16_FN adst, adst, 0, 12
+INV_TXFM_8X16_FN adst, flipadst, 0, 12
+INV_TXFM_8X16_FN adst, identity, 35, 12
+
+cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call .pass2_main
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_end:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+ jmp m(idct_8x16_internal_12bpc).end
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ mova [cq+32* 8], m0
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*15], m7
+ pmaxsd m0, m13, [cq+32* 2] ; 2
+ pmaxsd m3, m13, m1 ; 9
+ pmaxsd m1, m13, m5 ; 13
+ pmaxsd m4, m13, m2 ; 10
+ pmaxsd m2, m13, [cq+32* 6] ; 6
+ pmaxsd m5, m13, [cq+32* 5] ; 5
+ pmaxsd m6, m13, m6 ; 14
+ pmaxsd m7, m13, [cq+32* 1] ; 1
+ REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m12, [pd_2048]
+ vpbroadcastd m15, [pd_2896]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ pmaxsd m0, m13, [cq+32* 0] ; 0
+ pmaxsd m1, m13, [cq+32*15] ; 15
+ pmaxsd m2, m13, [cq+32* 4] ; 4
+ pmaxsd m3, m13, [cq+32*11] ; 11
+ pmaxsd m4, m13, [cq+32* 8] ; 8
+ pmaxsd m5, m13, [cq+32* 7] ; 7
+ pmaxsd m6, m13, [cq+32*12] ; 12
+ pmaxsd m7, m13, [cq+32* 3] ; 3
+ REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_16x8_internal_10bpc).main_part2
+ vpbroadcastd m14, [pd_17408]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_17407
+ pslld m15, 3 ; pd_8
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct, 0, 12
+INV_TXFM_8X16_FN flipadst, adst, 0, 12
+INV_TXFM_8X16_FN flipadst, flipadst, 0, 12
+INV_TXFM_8X16_FN flipadst, identity, 35, 12
+
+cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call m(iadst_8x16_internal_12bpc).pass2_main
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ jmp m(iadst_8x16_internal_12bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct, 0, 12
+INV_TXFM_8X16_FN identity, adst, 0, 12
+INV_TXFM_8X16_FN identity, flipadst, 0, 12
+INV_TXFM_8X16_FN identity, identity, 0, 12
+
+cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x16_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m13, m7, m15
+ vpbroadcastd m7, [pixel_12bpc_max]
+ vpbroadcastd m12, [pw_16384]
+ call m(iidentity_8x16_internal_10bpc).pass2_end
+ RET
+ALIGN function_align
+.pass2_main:
+ mova [cq], m7
+ vpbroadcastd m7, [clip_18b_min]
+ REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ pmaxsd m7, [cq]
+ mova [cq], m15
+ vpbroadcastd m15, [clip_18b_max]
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pminsd m15, [cq]
+ mova [cq], m7
+ vpbroadcastd m7, [pd_5793]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ pmulld m7, [cq]
+ mova [cq], m15
+ vpbroadcastd m15, [pd_1024]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [cq]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ret
+
+%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 16x4, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd m3, [dconly_%3bpc]
+%if %3 = 10
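+; dc-only shortcut: r6d is the dc coefficient scaled and rounded via the
+; 181-based (2896/16) constants, then broadcast and added to every pixel;
+; the saturating add/sub against the dconly bias in m3 clamps the result to
+; the valid pixel range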
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
+.dconly2:
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm3
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ paddsw m1, m0, [dstq+strideq*0]
+ paddsw m2, m0, [dstq+strideq*1]
+ psubusw m1, m3
+ psubusw m2, m3
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%else
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+.pass1:
+ vbroadcasti128 m0, [cq+16* 0]
+ vbroadcasti128 m4, [cq+16* 4]
+ vbroadcasti128 m1, [cq+16* 2]
+ vbroadcasti128 m7, [cq+16* 6]
+ vbroadcasti128 m5, [cq+16*10]
+ vbroadcasti128 m2, [cq+16* 8]
+ vbroadcasti128 m6, [cq+16*12]
+ vbroadcasti128 m3, [cq+16*14]
+ shufpd m0, m4, 0x0c ; 0 4
+ shufpd m1, m5, 0x0c ; 2 10
+ shufpd m2, m6, 0x0c ; 8 12
+ shufpd m3, m7, 0x0c ; 14 6
+ call .pass1_main
+ vbroadcasti128 m10, [cq+16* 1]
+ vbroadcasti128 m4, [cq+16* 5]
+ vbroadcasti128 m11, [cq+16*15]
+ vbroadcasti128 m5, [cq+16*11]
+ shufpd m10, m4, 0x0c ; 1 5
+ shufpd m11, m5, 0x0c ; 15 11
+ vbroadcasti128 m5, [cq+16* 9]
+ vbroadcasti128 m4, [cq+16*13]
+ shufpd m5, m4, 0x0c ; 9 13
+ vbroadcasti128 m6, [cq+16* 7]
+ vbroadcasti128 m4, [cq+16* 3]
+ shufpd m6, m4, 0x0c ; 7 3
+ call .pass1_main2
+ pcmpeqd m4, m4
+ REPX {psubd x, m4}, m0, m1, m2, m3
+ call .pass1_main3
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call .transpose_4x16_packed
+ lea r6, [deint_shuf+128]
+ call m(idct_16x4_internal_8bpc).main
+.end:
+ vpbroadcastd m4, [pw_2048]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ vpbroadcastd m5, [pixel_10bpc_max]
+.end2:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+.end3:
+ lea r6, [dstq+strideq*2]
+ paddw m2, [r6 +strideq*0]
+ paddw m3, [r6 +strideq*1]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ REPX {pminsw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [r6 +strideq*0], m2
+ mova [r6 +strideq*1], m3
+ RET
+ALIGN function_align
+.pass1_main:
+ vpbroadcastd m7, [pd_2048]
+ call m(idct_8x4_internal_10bpc).main
+ psubd m3, m0, m4 ; idct8 out7 out6
+ paddd m0, m4 ; idct8 out0 out1
+ paddd m1, m2, m5 ; idct8 out3 out2
+ psubd m2, m5 ; idct8 out4 out5
+ ret
+ALIGN function_align
+.pass1_main2:
+ ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
+ ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
+ vbroadcasti128 m12, [pd_3784_m3784]
+ psubd m4, m10, m5
+ paddd m10, m5 ; t8 t11
+ psignd m4, m12 ; t9 t10
+ psubd m5, m11, m6
+ paddd m11, m6 ; t15 t12
+ psignd m5, m12 ; t14 t13
+ vpbroadcastd m6, [pd_1567]
+ vpbroadcastd m13, [pd_3784]
+ REPX {pmaxsd x, m8}, m5, m4
+ REPX {pminsd x, m9}, m5, m4
+ pmulld m12, m5
+ pmulld m5, m6
+ vbroadcasti128 m6, [pd_1567_m1567]
+ pmulld m13, m4
+ pmulld m4, m6
+ REPX {pmaxsd x, m8}, m10, m11, m0, m1
+ REPX {pminsd x, m9}, m10, m11, m0, m1
+ paddd m12, m7
+ paddd m5, m7
+ paddd m4, m12
+ psubd m5, m13
+ psrad m4, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
+ vpbroadcastd m12, [pd_2896]
+ punpckhqdq m6, m11, m5
+ punpcklqdq m11, m4
+ punpckhqdq m4, m10, m4
+ punpcklqdq m10, m5
+ psubd m5, m11, m6 ; t12a t13
+ paddd m11, m6 ; t15a t14
+ psubd m6, m10, m4 ; t11a t10
+ paddd m10, m4 ; t8a t9
+ REPX {pmaxsd x, m8}, m5, m6
+ REPX {pminsd x, m9}, m5, m6
+ pmulld m5, m12
+ pmulld m6, m12
+ REPX {pmaxsd x, m8}, m2, m3, m11, m10
+ REPX {pminsd x, m9}, m2, m3, m11, m10
+ ret
+ALIGN function_align
+.pass1_main3:
+ paddd m5, m7
+ psubd m4, m5, m6
+ paddd m5, m6
+ psrad m4, 12 ; t11 t10a
+ psrad m5, 12 ; t12 t13a
+ psubd m7, m0, m11 ; out15 out14
+ paddd m0, m11 ; out0 out1
+ psubd m6, m1, m5 ; out12 out13
+ paddd m1, m5 ; out3 out2
+ psubd m5, m2, m4 ; out11 out10
+ paddd m2, m4 ; out4 out5
+ psubd m4, m3, m10 ; out8 out9
+ paddd m3, m10 ; out7 out6
+ REPX {pshufd x, x, q1032}, m1, m3, m5, m7
+ ret
+ALIGN function_align
+.transpose_4x16_packed:
+ vbroadcasti128 m8, [deint_shuf]
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ REPX {pshufb x, m8}, m0, m2, m4, m6
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m2, m4, m6
+ punpcklqdq m4, m6
+ vperm2i128 m3, m1, m2, 0x31
+ vinserti128 m1, xm2, 1
+ vperm2i128 m2, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call m(iadst_4x16_internal_10bpc).main
+ psrad m11, 11 ; pd_1
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ paddd m4, m5, m11
+ paddd m5, m6, m11
+ paddd m6, m7, m11
+ paddd m7, m8, m11
+.pass1_end:
+ REPX {pshufd x, x, q1032}, m0, m2, m4, m6
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ lea r6, [deint_shuf+128]
+ call m(iadst_16x4_internal_8bpc).main
+ jmp m(idct_16x4_internal_10bpc).end
+ALIGN function_align
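+; 4-point iadst applied to 16 columns (two registers per input row) using the
+; 1321/2482/3803/3344 multiplier scheme; outputs are left unrounded for
+; .main_end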
+.main:
+ vpbroadcastd m6, [pd_1321]
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ vpbroadcastd m7, [pd_2482]
+ mova m2, [cq+32*6]
+ mova m3, [cq+32*7]
+ pmulld m4, m0, m6
+ pmulld m5, m1, m6 ; 1321*in0
+ pmulld m9, m2, m7
+ pmulld m8, m3, m7 ; 2482*in3
+ paddd m4, m9
+ paddd m8, m5 ; 1321*in0 + 2482*in3
+ pmulld m5, m0, m7
+ pmulld m9, m1, m7 ; 2482*in0
+ paddd m0, m2
+ paddd m1, m3 ; in0 + in3
+ paddd m7, m6 ; pd_3803
+ pmulld m2, m7
+ pmulld m3, m7 ; 3803*in3
+ psubd m5, m2
+ psubd m9, m3 ; 2482*in0 - 3803*in3
+ mova m2, [cq+32*4]
+ pmulld m10, m7, m2
+ pmulld m3, m6, m2
+ psubd m2, m0
+ mova m0, [cq+32*5]
+ pmulld m7, m0 ; 3803*in2
+ pmulld m6, m0 ; 1321*in2
+ psubd m0, m1 ; in2 - in0 - in3
+ vpbroadcastd m1, [pd_m3344]
+ paddd m4, m10
+ paddd m7, m8 ; t0
+ psubd m5, m3
+ psubd m9, m6 ; t1
+ pmulld m2, m1
+ pmulld m0, m1 ; t2
+ pmulld m3, m1, [cq+32*2]
+ pmulld m1, [cq+32*3] ; -t3
+ ret
+ALIGN function_align
+.main_end:
+ ; expects: m6 = rnd
+ paddd m5, m6
+ paddd m9, m6
+ paddd m10, m4, m5
+ paddd m4, m6
+ paddd m8, m7, m6
+ paddd m7, m9
+ psubd m4, m3 ; out0 (unshifted)
+ psubd m5, m3 ; out1 (unshifted)
+ paddd m2, m6 ; out2 (unshifted)
+ paddd m3, m10 ; out3 (unshifted)
+ psubd m8, m1 ; out4 (unshifted)
+ psubd m9, m1 ; out5 (unshifted)
+ paddd m6, m0 ; out6 (unshifted)
+ paddd m7, m1 ; out7 (unshifted)
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call m(iadst_4x16_internal_10bpc).main
+ psrad m11, 11 ; pd_1
+ paddd m4, m3, m11
+ paddd m3, m5, m11
+ paddd m5, m2, m11
+ paddd m2, m6, m11
+ paddd m6, m1, m11
+ paddd m1, m7, m11
+ paddd m7, m0, m11
+ paddd m0, m8, m11
+ jmp m(iadst_16x4_internal_10bpc).pass1_end
+.pass2:
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ lea r6, [deint_shuf+128]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m4, [pw_2048]
+ pmulhrsw m5, m3, m4
+ pmulhrsw m6, m2, m4
+ pmulhrsw m2, m1, m4
+ pmulhrsw m3, m0, m4
+ paddw m0, m5, [dstq+strideq*0]
+ paddw m1, m6, [dstq+strideq*1]
+ vpbroadcastd m5, [pixel_10bpc_max]
+ jmp m(idct_16x4_internal_10bpc).end3
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [pd_5793]
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m4, [cq+32*4], q3120 ; 8 9
+ vpermq m5, [cq+32*5], q3120 ; a b
+ vpermq m6, [cq+32*6], q3120 ; c d
+ vpermq m7, [cq+32*7], q3120 ; e f
+ vpbroadcastd m9, [pd_3072]
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ vpbroadcastd m7, [pw_1697x8]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_16x4_internal_10bpc).end
+
+INV_TXFM_16X4_FN dct, dct, 12
+INV_TXFM_16X4_FN dct, identity, 12
+INV_TXFM_16X4_FN dct, adst, 12
+INV_TXFM_16X4_FN dct, flipadst, 12
+
+cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ jmp m(idct_16x4_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ ; deinterleave
+ REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+ ; transpose
+ punpcklqdq m8, m0, m1
+ punpckhqdq m0, m1
+ punpcklqdq m9, m2, m3
+ punpckhqdq m2, m3
+ punpcklqdq m10, m4, m5
+ punpckhqdq m4, m5
+ punpcklqdq m11, m6, m7
+ punpckhqdq m6, m7
+ vperm2i128 m3, m0, m2, 0x31 ; out6
+ vperm2i128 m1, m0, m2, 0x20 ; out2
+ vperm2i128 m7, m4, m6, 0x31 ; out7
+ vperm2i128 m5, m4, m6, 0x20 ; out3
+ vperm2i128 m13, m10, m11, 0x31 ; out5
+ vperm2i128 m12, m10, m11, 0x20 ; out1
+ vperm2i128 m11, m8, m9, 0x31 ; out4
+ vperm2i128 m10, m8, m9, 0x20 ; out0
+ call m(idct_4x16_internal_10bpc).pass1_main
+ pmulld m0, m6, m10
+ pmulld m2, m6, m11
+ pmulld m4, m6, m12
+ pmulld m6, m13
+ vpbroadcastd m10, [pd_17408]
+ call m(idct_4x16_internal_10bpc).pass1_main2
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m5, [pixel_12bpc_max]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN adst, dct, 12
+INV_TXFM_16X4_FN adst, adst, 12
+INV_TXFM_16X4_FN adst, flipadst, 12
+INV_TXFM_16X4_FN adst, identity, 12
+
+cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_16x4_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ jmp m(idct_16x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7
+ pmaxsd m8, m4, m12
+ pmaxsd m9, m5, m12
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(iadst_8x4_internal_12bpc).transpose_4x8
+ mova [cq+32*0], m0
+ mova [cq+32*2], m1
+ mova [cq+32*4], m2
+ mova [cq+32*6], m3
+ pminsd m0, m8, m13
+ pminsd m1, m9, m13
+ pminsd m2, m6, m13
+ pminsd m3, m7, m13
+ call m(iadst_8x4_internal_12bpc).transpose_4x8
+ mova [cq+32*1], m0
+ mova [cq+32*3], m1
+ mova [cq+32*5], m2
+ mova [cq+32*7], m3
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_2048]
+ call m(iadst_16x4_internal_10bpc).main_end
+ psrad m0, m4, 15
+ psrad m1, m5, 15
+ psrad m2, 15
+ psrad m3, 15
+ psrad m4, m8, 15
+ psrad m5, m9, 15
+ psrad m6, 15
+ psrad m7, 15
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m4, [pw_16384]
+ vpbroadcastd m5, [pixel_12bpc_max]
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct, 12
+INV_TXFM_16X4_FN flipadst, adst, 12
+INV_TXFM_16X4_FN flipadst, flipadst, 12
+INV_TXFM_16X4_FN flipadst, identity, 12
+
+cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_16x4_internal_10bpc).pass1
+.pass2:
+ call m(iadst_16x4_internal_12bpc).pass2_main
+ vpermq m7, m0, q3120
+ vpermq m6, m1, q3120
+ vpermq m1, m2, q3120
+ vpermq m0, m3, q3120
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m6, m4
+ pmulhrsw m3, m7, m4
+ jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN identity, dct, 12
+INV_TXFM_16X4_FN identity, adst, 12
+INV_TXFM_16X4_FN identity, flipadst, 12
+INV_TXFM_16X4_FN identity, identity, 12
+
+cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [pd_1697]
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpbroadcastd m9, [pd_3072]
+ pmulld m4, m8, m0
+ pmulld m5, m8, m1
+ pmulld m6, m8, m2
+ pmulld m7, m8, m3
+ vpermq m10, [cq+32*4], q3120 ; 8 9
+ vpermq m11, [cq+32*5], q3120 ; a b
+ vpermq m12, [cq+32*6], q3120 ; c d
+ vpermq m13, [cq+32*7], q3120 ; e f
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m0, m4
+ pmulld m4, m8, m10
+ paddd m1, m5
+ pmulld m5, m8, m11
+ paddd m2, m6
+ pmulld m6, m8, m12
+ paddd m3, m7
+ pmulld m7, m8, m13
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m4, m10
+ paddd m5, m11
+ paddd m6, m12
+ paddd m7, m13
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m8, [pd_5793]
+ vpbroadcastd m9, [pd_2048]
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ vpbroadcastd m4, [pw_16384]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ jmp m(idct_16x4_internal_10bpc).end2
+
+%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 16x8, %3
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%3bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ pmulld m0, m14, [cq+32* 1]
+ pmulld m1, m14, [cq+32* 3]
+ pmulld m2, m14, [cq+32* 5]
+ pmulld m3, m14, [cq+32* 7]
+ pmulld m4, m14, [cq+32* 9]
+ pmulld m5, m14, [cq+32*11]
+ pmulld m6, m14, [cq+32*13]
+ pmulld m7, m14, [cq+32*15]
+ vpbroadcastd m11, [pd_2048]
+ lea r6, [rsp+32*4]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+32* 0]
+ pmulld m1, m14, [cq+32* 2]
+ pmulld m2, m14, [cq+32* 4]
+ pmulld m3, m14, [cq+32* 6]
+ pmulld m4, m14, [cq+32* 8]
+ pmulld m5, m14, [cq+32*10]
+ pmulld m6, m14, [cq+32*12]
+ pmulld m7, m14, [cq+32*14]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrld m11, 11 ; pd_1
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .pass1_rotations
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m10, [pw_2048]
+.end:
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ call .write_16x4_start
+.end2:
+ pmulhrsw m0, m4, m10
+ pmulhrsw m1, m5, m10
+ pmulhrsw m2, m6, m10
+ pmulhrsw m3, m7, m10
+ call .write_16x4_zero
+ RET
+ALIGN function_align
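+; final idct16 butterflies: combine the idct8 (even-coefficient) outputs in
+; m0-m7 with the odd half saved at [r6] to produce out0-out15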
+.pass1_rotations:
+ mova m14, [r6-32*4]
+ mova m13, [r6-32*3]
+ mova m12, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m10, [r6+32*0]
+ mova m9, [r6+32*1]
+ mova m8, [r6+32*2]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r6+32*3] ; out8
+ paddd m7, [r6+32*3] ; out7
+ ret
+ALIGN function_align
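+; pack the 32-bit pass-1 outputs in m0-m15 to 16-bit words and transpose so
+; that pass 2 can reuse the 8bpc 16x8 kernels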
+.transpose:
+ lea r6, [deint_shuf+128]
+.transpose2:
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m7, m15
+.transpose3:
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ punpckhdq m7, m4, m6
+ punpckldq m4, m6
+ punpckldq m6, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m3, m5
+ punpckldq m3, m5
+ punpcklqdq m5, m6, m3
+ punpckhqdq m6, m3
+ punpckhqdq m3, m2, m7
+ punpcklqdq m2, m7
+ punpcklqdq m7, m8, m1
+ punpckhqdq m8, m1
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ vperm2i128 m4, m0, m5, 0x31
+ vinserti128 m0, xm5, 1
+ vperm2i128 m5, m1, m6, 0x31
+ vinserti128 m1, xm6, 1
+ vperm2i128 m6, m2, m7, 0x31
+ vinserti128 m2, xm7, 1
+ vperm2i128 m7, m3, m8, 0x31
+ vinserti128 m3, xm8, 1
+ ret
+ALIGN function_align
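+; write_16x4: add four rows of 16-pixel residuals (m0-m3) to dst, clamp to
+; [0, m9] and store; _zero also clears eight rows of the coefficient buffer,
+; _start additionally initializes m8/m9/r3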
+.write_16x4_start:
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r3, [strideq*3]
+ pxor m8, m8
+.write_16x4_zero:
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 32*8
+.write_16x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3 ]
+ REPX {pmaxsw x, m8}, m0, m1, m2, m3
+ REPX {pminsw x, m9}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ lea dstq, [dstq+strideq*4]
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ lea r6, [rsp+32*4]
+ call .main
+ vpbroadcastd m14, [pd_3072]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_3071
+ call .pass1_rotations
+.pass1_end:
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11
+ jmp tx2q
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ vpbroadcastd m10, [pw_2048]
+ pxor m11, m11
+ psubw m11, m10
+ pmulhrsw m0, m10
+ pmulhrsw m1, m11
+ pmulhrsw m2, m10
+ pmulhrsw m3, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m4, m10
+ pmulhrsw m1, m5, m11
+ pmulhrsw m2, m6, m10
+ pmulhrsw m3, m7, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.pass1_rotations:
+ paddd m0, m15
+ psubd m1, m15, m1
+ paddd m2, m15
+ psubd m3, m15, m3
+ paddd m4, m14
+ psubd m5, m13, m5
+ paddd m6, m14
+ psubd m7, m13, m7
+ paddd m8, m14, m9
+ psubd m9, m13, m10
+ paddd m10, m14, m11
+ psubd m11, m13, m12
+ paddd m12, m15, [r6-32*1]
+ psubd m13, m15, [r6-32*2]
+ paddd m14, m15, [r6-32*3]
+ psubd m15, [r6-32*4]
+ ret
+ALIGN function_align
+.main:
+ ; expects: m13 = clip_min m14 = clip_max
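+ ; 16-point iadst in dword precision: the first batch of inputs goes through
+ ; .main_part1 (intermediates spilled to [r6]), the second through
+ ; .main_part2, which leaves several outputs unshifted for the caller's
+ ; pass1_rotations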
+ vpbroadcastd m15, [pd_2896]
+ pmulld m0, m15, [cq+32* 2]
+ pmulld m1, m15, [cq+32*13]
+ pmulld m2, m15, [cq+32* 6]
+ pmulld m3, m15, [cq+32* 9]
+ pmulld m4, m15, [cq+32*10]
+ pmulld m5, m15, [cq+32* 5]
+ pmulld m6, m15, [cq+32*14]
+ pmulld m7, m15, [cq+32* 1]
+ vpbroadcastd m12, [pd_2048]
+ REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ call .main_part1
+ pmulld m0, m15, [cq+32* 0]
+ pmulld m1, m15, [cq+32*15]
+ pmulld m2, m15, [cq+32* 4]
+ pmulld m3, m15, [cq+32*11]
+ pmulld m4, m15, [cq+32* 8]
+ pmulld m5, m15, [cq+32* 7]
+ pmulld m6, m15, [cq+32*12]
+ pmulld m7, m15, [cq+32* 3]
+ REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_part2:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380
+ psubd m8, m0, m4 ; t8a
+ paddd m0, m4 ; t0a
+ psubd m4, m1, m5 ; t9a
+ paddd m1, m5 ; t1a
+ psubd m5, m2, m6 ; t12a
+ paddd m2, m6 ; t4a
+ psubd m6, m3, m7 ; t13a
+ paddd m7, m3 ; t5a
+ REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
+ vpbroadcastd m11, [pd_4017]
+ vpbroadcastd m10, [pd_799]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
+ psubd m3, m0, m2 ; t4
+ paddd m0, m2 ; t0
+ psubd m2, m1, m7 ; t5
+ paddd m1, m7 ; t1
+ psubd m7, m4, m6 ; t12a
+ paddd m4, m6 ; t8a
+ psubd m6, m8, m5 ; t13a
+ paddd m5, m8 ; t9a
+ REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5
+ vpbroadcastd m11, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11
+ pminsd m10, m14, [r6-32*4] ; t2
+ pminsd m8, m14, [r6-32*3] ; t3
+ psubd m9, m0, m10 ; t2a
+ paddd m0, m10 ; out0
+ psubd m10, m1, m8 ; t3a
+ paddd m1, m8 ; -out15
+ pmaxsd m9, m13
+ pmaxsd m10, m13
+ pminsd m9, m14
+ pminsd m10, m14
+ mova [r6-32*4], m1
+ mova m11, [r6-32*1] ; t7a
+ mova m1, [r6-32*2] ; t6a
+ psubd m8, m3, m11 ; t7
+ paddd m11, m3 ; out12
+ paddd m3, m2, m1 ; -out3
+ psubd m2, m1 ; t6
+ pmaxsd m8, m13
+ pmaxsd m2, m13
+ pminsd m8, m14
+ pminsd m2, m14
+ mova [r6-32*1], m11
+ mova [r6-32*3], m2
+ mova m1, [r6+32*3] ; t15
+ mova m2, [r6+32*2] ; t14
+ paddd m12, m7, m1 ; -out13
+ psubd m7, m1 ; t15a
+ psubd m11, m6, m2 ; t14a
+ paddd m2, m6 ; out2
+ pmaxsd m7, m13
+ pmaxsd m11, m13
+ pminsd m7, m14
+ pminsd m11, m14
+ mova [r6-32*2], m12
+ pminsd m1, m14, [r6+32*0] ; t10a
+ pminsd m12, m14, [r6+32*1] ; t11a
+ psubd m6, m4, m1 ; t10
+ paddd m1, m4 ; -out1
+ psubd m4, m5, m12 ; t11
+ paddd m5, m12 ; out14
+ vpbroadcastd m12, [pd_1448]
+ pmaxsd m6, m13
+ pmaxsd m4, m13
+ pminsd m6, m14
+ pminsd m4, m14
+ REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
+ pmulld m12, [r6-32*3] ; t6
+ mova [r6-32*3], m5
+ paddd m5, m11, m7 ; -out5 (unshifted)
+ psubd m11, m7 ; out10 (unshifted)
+ paddd m7, m9, m10 ; -out7 (unshifted)
+ psubd m9, m10 ; out8 (unshifted)
+ psubd m10, m6, m4 ; -out9 (unshifted)
+ paddd m6, m4 ; out6 (unshifted)
+ paddd m4, m12, m8 ; out4 (unshifted)
+ psubd m12, m8 ; -out11 (unshifted)
+ ret
+.main_part1:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601
+ psubd m8, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m3, m7 ; t15a
+ paddd m7, m3 ; t7a
+ REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
+ vpbroadcastd m11, [pd_2276]
+ vpbroadcastd m10, [pd_3406]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m8, m5 ; t15a
+ paddd m5, m8 ; t11a
+ REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later
+ vpbroadcastd m11, [pd_1567]
+ vpbroadcastd m10, [pd_3784]
+ ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ lea r6, [rsp+32*4]
+ call m(iadst_16x8_internal_10bpc).main
+ vpbroadcastd m14, [pd_3072]
+ psrld m15, 11
+ psubd m13, m14, m15
+ call .pass1_rotations
+ jmp m(iadst_16x8_internal_10bpc).pass1_end
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ vpbroadcastd m10, [pw_2048]
+ pxor m11, m11
+ psubw m11, m10
+ mova m12, m0
+ pmulhrsw m0, m7, m11
+ mova m7, m1
+ pmulhrsw m1, m6, m10
+ mova m6, m2
+ pmulhrsw m2, m5, m11
+ mova m5, m3
+ pmulhrsw m3, m4, m10
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m5, m11
+ pmulhrsw m1, m6, m10
+ pmulhrsw m2, m7, m11
+ pmulhrsw m3, m12, m10
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.pass1_rotations:
+ psubd m8, m13, m7
+ paddd m7, m14, m9
+ paddd m9, m14, m6
+ psubd m6, m13, m10
+ psubd m10, m13, m5
+ paddd m5, m14, m11
+ paddd m11, m14, m4
+ psubd m4, m13, m12
+ psubd m12, m15, m3
+ paddd m3, m15, [r6-32*1]
+ paddd m13, m15, m2
+ psubd m2, m15, [r6-32*2]
+ psubd m14, m15, m1
+ mova m1, m15
+ paddd m15, m0
+ psubd m0, m1, [r6-32*4]
+ paddd m1, [r6-32*3]
+ ret
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ pmulld m0, m15, [cq+32* 0]
+ pmulld m1, m15, [cq+32* 1]
+ pmulld m2, m15, [cq+32* 2]
+ pmulld m3, m15, [cq+32* 3]
+ pmulld m4, m15, [cq+32* 4]
+ pmulld m5, m15, [cq+32* 5]
+ pmulld m6, m15, [cq+32* 6]
+ pmulld m7, m15, [cq+32* 7]
+ pmulld m8, m15, [cq+32* 8]
+ pmulld m9, m15, [cq+32* 9]
+ pmulld m10, m15, [cq+32*10]
+ pmulld m11, m15, [cq+32*11]
+ pmulld m12, m15, [cq+32*12]
+ pmulld m13, m15, [cq+32*13]
+ pmulld m14, m15, [cq+32*14]
+ pmulld m15, [cq+32*15]
+ mova [rsp], m7
+ vpbroadcastd m7, [pd_2048]
+ REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [rsp]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ mova [rsp], m15
+ vpbroadcastd m15, [pd_5793]
+ REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pmulld m15, [rsp]
+ mova [rsp], m7
+ vpbroadcastd m7, [pd_3072]
+ REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [rsp]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose
+ vpbroadcastd m10, [pw_4096]
+ jmp m(idct_16x8_internal_10bpc).end
+
+INV_TXFM_16X8_FN dct, dct, 12
+INV_TXFM_16X8_FN dct, identity, 12
+INV_TXFM_16X8_FN dct, adst, 12
+INV_TXFM_16X8_FN dct, flipadst, 12
+
+cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_16x8_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ RET
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m11, [pd_2048]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_12bpc).round_shift4
+ mova [cq+32* 8], m0
+ mova [cq+32* 9], m1
+ mova [cq+32*10], m2
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*13], m5
+ mova [cq+32*14], m6
+ mova [cq+32*15], m7
+ pmaxsd m0, m12, [cq+32*0]
+ pmaxsd m1, m12, [cq+32*1]
+ pmaxsd m2, m12, [cq+32*2]
+ pmaxsd m3, m12, [cq+32*3]
+ pmaxsd m4, m12, [cq+32*4]
+ pmaxsd m5, m12, [cq+32*5]
+ pmaxsd m6, m12, [cq+32*6]
+ pmaxsd m7, m12, [cq+32*7]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_12bpc).round_shift4
+.end:
+ packssdw m0, [cq+32* 8]
+ packssdw m1, [cq+32* 9]
+ packssdw m2, [cq+32*10]
+ packssdw m3, [cq+32*11]
+ packssdw m4, [cq+32*12]
+ packssdw m5, [cq+32*13]
+ packssdw m6, [cq+32*14]
+ packssdw m7, [cq+32*15]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ call .write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q3120
+ vpermq m2, m6, q3120
+ vpermq m3, m7, q3120
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.write_16x4_start:
+ vpbroadcastd m9, [pixel_12bpc_max]
+ lea r3, [strideq*3]
+ pxor m8, m8
+ ret
+
+INV_TXFM_16X8_FN adst, dct, 12
+INV_TXFM_16X8_FN adst, adst, 12
+INV_TXFM_16X8_FN adst, flipadst, 12
+INV_TXFM_16X8_FN adst, identity, 12
+
+cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iadst_16x8_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ call m(idct_16x8_internal_12bpc).end
+ RET
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m11, [pd_2048]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_8x8_internal_12bpc).pass2_main2
+ mova [cq+32* 8], m0
+ mova [cq+32* 9], m1
+ mova [cq+32*10], m2
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*13], m5
+ mova [cq+32*14], m6
+ mova [cq+32*15], m7
+ pmaxsd m0, m12, [cq+32*0]
+ pmaxsd m1, m12, [cq+32*1]
+ pmaxsd m2, m12, [cq+32*2]
+ pmaxsd m3, m12, [cq+32*3]
+ pmaxsd m4, m12, [cq+32*4]
+ pmaxsd m5, m12, [cq+32*5]
+ pmaxsd m6, m12, [cq+32*6]
+ pmaxsd m7, m12, [cq+32*7]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_8x8_internal_12bpc).pass2_main2
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct, 12
+INV_TXFM_16X8_FN flipadst, adst, 12
+INV_TXFM_16X8_FN flipadst, flipadst, 12
+INV_TXFM_16X8_FN flipadst, identity, 12
+
+cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iflipadst_16x8_internal_10bpc).pass1
+.pass2:
+ call m(iadst_16x8_internal_12bpc).pass2_main
+ packssdw m13, m0, [cq+32* 8]
+ packssdw m12, m1, [cq+32* 9]
+ packssdw m11, m2, [cq+32*10]
+ packssdw m10, m3, [cq+32*11]
+ packssdw m3, m4, [cq+32*12]
+ packssdw m2, m5, [cq+32*13]
+ packssdw m1, m6, [cq+32*14]
+ packssdw m0, m7, [cq+32*15]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpermq m0, m10, q3120
+ vpermq m1, m11, q3120
+ vpermq m2, m12, q3120
+ vpermq m3, m13, q3120
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+
+INV_TXFM_16X8_FN identity, dct, 12
+INV_TXFM_16X8_FN identity, adst, 12
+INV_TXFM_16X8_FN identity, flipadst, 12
+INV_TXFM_16X8_FN identity, identity, 12
+
+cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ jmp m(iidentity_16x8_internal_10bpc).pass1
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose2
+ vpbroadcastd m10, [pw_4096]
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ jmp m(idct_16x8_internal_10bpc).end2
+
+%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
+ INV_TXFM_FN %1, %2, %3, 16x16, %4
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%4bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
+%endif
+%endmacro
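+; the dct_dct case above is the usual dc-only shortcut: the single dc
+; coefficient is scaled by 181 (~sqrt(2)*128) with a +640/>>10 rounding
+; step, the coefficient slot is cleared, r3d is set up as the 16-row
+; counter, and the shared 16x4 dconly3 tail (not part of this hunk)
+; presumably finishes the scaling, clamps and adds the constant to dst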
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, 28
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ add cq, 32
+ call .main
+ sub cq, 32
+ mova m10, [r6-32*4]
+ mova m9, [r6-32*3]
+ mova m8, [r6-32*2]
+ psubd m15, m0, m10 ; out15
+ paddd m0, m10 ; out0
+ psubd m10, m1, m9 ; out14
+ paddd m1, m9 ; out1
+ psubd m9, m2, m8 ; out13
+ paddd m2, m8 ; out2
+ REPX {psrad x, 2}, m0, m1, m2
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova m2, [r6-32*1]
+ mova m1, [r6+32*0]
+ mova m0, [r6+32*1]
+ REPX {psrad x, 2}, m9, m10, m15
+ psubd m8, m3, m2 ; out12
+ paddd m3, m2 ; out3
+ psubd m2, m4, m1 ; out11
+ paddd m4, m1 ; out4
+ psubd m1, m5, m0 ; out10
+ paddd m5, m0 ; out5
+ REPX {psrad x, 2}, m3, m4, m5
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova m4, [r6+32*2]
+ mova m3, [r6+32*3]
+ REPX {psrad x, 2}, m1, m2, m8
+ psubd m5, m6, m4 ; out9
+ paddd m6, m4 ; out6
+ psubd m4, m7, m3 ; out8
+ paddd m7, m3 ; out7
+ REPX {psrad x, 2}, m6, m7, m4, m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ add r6, 32*8
+ mova [r6-32*4], m4
+ mova [r6-32*3], m5
+ mova [r6-32*2], m1
+ mova [r6-32*1], m2
+ mova [r6+32*0], m8
+ mova [r6+32*1], m9
+ mova [r6+32*2], m10
+ mova [r6+32*3], m15
+.fast:
+ add r6, 32*8
+ call .main
+ mova m14, [r6-32*4]
+ mova m13, [r6-32*3]
+ mova m12, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m10, [r6+32*0]
+ mova m9, [r6+32*1]
+ mova m8, [r6+32*2]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r6+32*3] ; out8
+ paddd m7, [r6+32*3] ; out7
+ sub r6, 32*8
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ lea r6, [pw_5+128]
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+.end:
+ call .write_16x16
+ RET
+ALIGN function_align
+.write_16x16:
+ mova [rsp+gprsize+32*0], m8
+ mova [rsp+gprsize+32*1], m9
+ mova [rsp+gprsize+32*2], m12
+ vpbroadcastd m12, [pw_2048]
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+.write_16x16_2:
+ pmulhrsw m0, m12, m4
+ pmulhrsw m1, m12, m5
+ pmulhrsw m2, m12, m6
+ pmulhrsw m3, m12, m7
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+gprsize+32*0]
+ pmulhrsw m1, m12, [rsp+gprsize+32*1]
+ pmulhrsw m2, m12, m10
+ pmulhrsw m3, m12, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+gprsize+32*2]
+ pmulhrsw m1, m12, m13
+ pmulhrsw m2, m12, m14
+ pmulhrsw m3, m12, m15
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.transpose:
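+; packs the 32-bit pass 1 output down to words and transposes the full
+; 16x16 tile with punpck{l,h}{wd,dq,qdq} + vperm2i128/vinserti128; if
+; eobd is still negative only the fast pass 1 path ran, so
+; .transpose_fast falls back to the 16x8 transpose for the low half and
+; zeroes m8-m15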
+ test eobd, eobd
+ jl .transpose_fast
+ packssdw m8, [r6-32*4]
+ packssdw m9, [r6-32*3]
+ packssdw m10, [r6-32*2]
+ packssdw m11, [r6-32*1]
+ packssdw m12, [r6+32*0]
+ packssdw m13, [r6+32*1]
+ packssdw m14, [r6+32*2]
+ packssdw m15, [r6+32*3]
+ sub r6, 32*8
+ packssdw m0, [r6-32*4]
+ packssdw m1, [r6-32*3]
+ packssdw m2, [r6-32*2]
+ packssdw m3, [r6-32*1]
+ packssdw m4, [r6+32*0]
+ packssdw m5, [r6+32*1]
+ packssdw m6, [r6+32*2]
+ packssdw m7, [r6+32*3]
+ mova [r6], m8
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m3, m6, m7
+ punpcklwd m6, m7
+ punpcklwd m7, m4, m5
+ punpckhwd m4, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m7, m6
+ punpckldq m7, m6
+ punpckhdq m6, m4, m3
+ punpckldq m4, m3
+ punpckhqdq m3, m2, m1
+ punpcklqdq m2, m1
+ punpckhqdq m1, m0, m7
+ punpcklqdq m0, m7
+ punpcklqdq m7, m8, m6
+ punpckhqdq m8, m6
+ punpckhqdq m6, m5, m4
+ punpcklqdq m5, m4
+ mova m4, [r6]
+ mova [r6], m8
+ punpcklwd m8, m4, m9
+ punpckhwd m4, m9
+ punpcklwd m9, m10, m11
+ punpckhwd m10, m11
+ punpckhwd m11, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m12, m13
+ punpcklwd m12, m13
+ punpckldq m13, m4, m10
+ punpckhdq m4, m10
+ punpckhdq m10, m8, m9
+ punpckldq m8, m9
+ punpckhdq m9, m12, m14
+ punpckldq m12, m14
+ punpckhdq m14, m15, m11
+ punpckldq m15, m11
+ punpckhqdq m11, m10, m9
+ punpcklqdq m10, m9
+ punpckhqdq m9, m8, m12
+ punpcklqdq m8, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m4, m14
+ punpcklqdq m14, m4, m14
+ vperm2i128 m4, m0, m8, 0x31
+ vinserti128 m0, xm8, 1
+ vinserti128 m8, m5, xm12, 1
+ vperm2i128 m12, m5, 0x13
+ vperm2i128 m5, m1, m9, 0x31
+ vinserti128 m1, xm9, 1
+ vinserti128 m9, m6, xm13, 1
+ vperm2i128 m13, m6, 0x13
+ vperm2i128 m6, m2, m10, 0x31
+ vinserti128 m2, xm10, 1
+ vinserti128 m10, m7, xm14, 1
+ vperm2i128 m14, m7, 0x13
+ vperm2i128 m7, m3, m11, 0x31
+ vinserti128 m3, xm11, 1
+ mova xm11, [r6]
+ vinserti128 m11, xm15, 1
+ vinserti128 m15, [r6+16], 0
+ ret
+.transpose_fast:
+ call m(idct_16x8_internal_10bpc).transpose2
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ ret
+ALIGN function_align
+.main:
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64* 3]
+ mova m2, [cq+64* 5]
+ mova m3, [cq+64* 7]
+ mova m4, [cq+64* 9]
+ mova m5, [cq+64*11]
+ mova m6, [cq+64*13]
+ mova m7, [cq+64*15]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 2]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64* 6]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64*10]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64*14]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrld m10, m11, 10 ; pd_2
+ REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ add cq, 32
+ call .main
+ sub cq, 32
+ vpbroadcastd m8, [pd_5120]
+ paddd m4, m8
+ paddd m6, m8
+ paddd m9, m8
+ paddd m11, m8
+ vpbroadcastd m8, [pd_5119]
+ psubd m5, m8, m5
+ psubd m7, m8, m7
+ psubd m10, m8, m10
+ psubd m12, m8, m12
+ REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ psrld m4, m15, 10 ; pd_2
+ paddd m0, m4
+ psubd m1, m4, m1
+ paddd m2, m4
+ psubd m3, m4, m3
+ psubd m7, m4, [r6-32*4]
+ paddd m6, m4, [r6-32*3]
+ psubd m5, m4, [r6-32*2]
+ paddd m4, [r6-32*1]
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ add r6, 32*8
+ mova [r6-32*4], m9
+ mova [r6-32*3], m10
+ mova [r6-32*2], m11
+ mova [r6-32*1], m12
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+.fast:
+ add r6, 32*8
+ call .main
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
+ psrld m15, 10 ; pd_2
+ paddd m0, m15
+ psubd m1, m15, m1
+ paddd m2, m15
+ psubd m3, m15, m3
+ paddd m4, m14
+ psubd m5, m13, m5
+ paddd m6, m14
+ psubd m7, m13, m7
+ paddd m8, m14, m9
+ psubd m9, m13, m10
+ paddd m10, m14, m11
+ psubd m11, m13, m12
+ paddd m12, m15, [r6-32*1]
+ psubd m13, m15, [r6-32*2]
+ paddd m14, m15, [r6-32*3]
+ psubd m15, [r6-32*4]
+.pass1_end:
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+ sub r6, 32*8
+ jmp tx2q
+.pass2:
+ call m(idct_16x16_internal_10bpc).transpose
+ lea r6, [pw_5+128]
+ mova [rsp], m15
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass2_end
+ mova [rsp+32*0], m8
+ mova [rsp+32*2], m12
+ mova [rsp+32*3], m13
+ vpbroadcastd m12, [pw_2048]
+ pxor m13, m13
+ psubw m13, m12
+ pmulhrsw m0, m12
+ pmulhrsw m1, m13, [rsp+32*1]
+ mova [rsp+32*1], m9
+ pmulhrsw m2, m12
+ pmulhrsw m3, m13
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m12, m4
+ pmulhrsw m1, m13, m5
+ pmulhrsw m2, m12, m6
+ pmulhrsw m3, m13, m7
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+32*0]
+ pmulhrsw m1, m13, [rsp+32*1]
+ pmulhrsw m2, m12, m10
+ pmulhrsw m3, m13, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+32*2]
+ pmulhrsw m1, m13, [rsp+32*3]
+ pmulhrsw m2, m12, m14
+ pmulhrsw m3, m13, m15
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+64* 2]
+ mova m1, [cq+64*13]
+ mova m2, [cq+64* 6]
+ mova m3, [cq+64* 9]
+ mova m4, [cq+64*10]
+ mova m5, [cq+64* 5]
+ mova m6, [cq+64*14]
+ mova m7, [cq+64* 1]
+ vpbroadcastd m12, [pd_2048]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64*15]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64*11]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64* 7]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64* 3]
+ jmp m(iadst_16x8_internal_10bpc).main_part2
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ add cq, 32
+ call m(iadst_16x16_internal_10bpc).main
+ sub cq, 32
+ vpbroadcastd m8, [pd_5120]
+ paddd m11, m8
+ paddd m9, m8
+ paddd m6, m8
+ paddd m4, m8
+ vpbroadcastd m8, [pd_5119]
+ psubd m12, m8, m12
+ psubd m10, m8, m10
+ psubd m7, m8, m7
+ psubd m5, m8, m5
+ REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4
+ mova [r6+32*0], m12
+ mova [r6+32*1], m11
+ mova [r6+32*2], m10
+ mova [r6+32*3], m9
+ psrld m9, m15, 10 ; pd_2
+ psubd m3, m9, m3
+ paddd m2, m9
+ psubd m1, m9, m1
+ paddd m0, m9
+ psubd m12, m9, [r6-32*4]
+ paddd m11, m9, [r6-32*3]
+ psubd m10, m9, [r6-32*2]
+ paddd m9, [r6-32*1]
+ REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0
+ mova [r6-32*4], m12
+ mova [r6-32*3], m11
+ mova [r6-32*2], m10
+ mova [r6-32*1], m9
+ add r6, 32*8
+ mova [r6-32*4], m7
+ mova [r6-32*3], m6
+ mova [r6-32*2], m5
+ mova [r6-32*1], m4
+ mova [r6+32*0], m3
+ mova [r6+32*1], m2
+ mova [r6+32*2], m1
+ mova [r6+32*3], m0
+.fast:
+ add r6, 32*8
+ call m(iadst_16x16_internal_10bpc).main
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
+ psrld m15, 10 ; pd_2
+ psubd m8, m13, m7
+ paddd m7, m14, m9
+ paddd m9, m14, m6
+ psubd m6, m13, m10
+ psubd m10, m13, m5
+ paddd m5, m14, m11
+ paddd m11, m14, m4
+ psubd m4, m13, m12
+ psubd m12, m15, m3
+ paddd m3, m15, [r6-32*1]
+ paddd m13, m15, m2
+ psubd m2, m15, [r6-32*2]
+ psubd m14, m15, m1
+ mova m1, m15
+ paddd m15, m0
+ psubd m0, m1, [r6-32*4]
+ paddd m1, [r6-32*3]
+ jmp m(iadst_16x16_internal_10bpc).pass1_end
+.pass2:
+ call m(idct_16x16_internal_10bpc).transpose
+ lea r6, [pw_5+128]
+ mova [rsp], m15
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass2_end
+ mova [rsp+32*3], m3
+ mova [rsp+32*2], m2
+ mova [rsp+32*0], m0
+ mova m2, m13
+ mova m3, m12
+ vpbroadcastd m12, [pw_2048]
+ pxor m13, m13
+ psubw m13, m12
+ pmulhrsw m0, m13, m15
+ pmulhrsw m1, m12, m14
+ pmulhrsw m2, m13
+ pmulhrsw m3, m12
+ mova m14, m8
+ mova m15, m9
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m13, m11
+ pmulhrsw m1, m12, m10
+ pmulhrsw m2, m13, m15
+ pmulhrsw m3, m12, m14
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m13, m7
+ pmulhrsw m1, m12, m6
+ pmulhrsw m2, m13, m5
+ pmulhrsw m3, m12, m4
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m13, [rsp+32*3]
+ pmulhrsw m1, m12, [rsp+32*2]
+ pmulhrsw m2, m13, [rsp+32*1]
+ pmulhrsw m3, m12, [rsp+32*0]
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+
+INV_TXFM_16X16_FN identity, dct, -92
+INV_TXFM_16X16_FN identity, identity
+
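+; 16x16 identity, 10 bpc: pass 1 multiplies every dword coefficient by
+; 5793 (sqrt(2) in .12 fixed point) with the pd_5120 bias and a 13-bit
+; shift; for eob >= 36 the .righthalf loop first pushes the right 8
+; columns through the same scaling into the stack buffer. pass 2 packs,
+; transposes and applies the word-domain IDTX16 (pw_1697x16) scaling
+; before reusing the common 16x16 write path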
+cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m15, [pd_5793]
+ vpbroadcastd m7, [pd_5120]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ mov r3, -32*8*4
+.righthalf:
+ pmulld m0, m15, [cq+r3+32*33]
+ pmulld m1, m15, [cq+r3+32*35]
+ pmulld m2, m15, [cq+r3+32*37]
+ pmulld m3, m15, [cq+r3+32*39]
+ add r6, 32*4
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 13}, m0, m1, m2, m3
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ add r3, 32*8
+ jl .righthalf
+.fast:
+ pmulld m0, m15, [cq+64* 0]
+ pmulld m1, m15, [cq+64* 1]
+ pmulld m2, m15, [cq+64* 2]
+ pmulld m3, m15, [cq+64* 3]
+ pmulld m4, m15, [cq+64* 4]
+ pmulld m5, m15, [cq+64* 5]
+ pmulld m6, m15, [cq+64* 6]
+ pmulld m8, m15, [cq+64* 7]
+ mova [cq], m8
+ pmulld m8, m15, [cq+64* 8]
+ pmulld m9, m15, [cq+64* 9]
+ pmulld m10, m15, [cq+64*10]
+ pmulld m11, m15, [cq+64*11]
+ pmulld m12, m15, [cq+64*12]
+ pmulld m13, m15, [cq+64*13]
+ pmulld m14, m15, [cq+64*14]
+ pmulld m15, [cq+64*15]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [cq]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_16x16_internal_10bpc).transpose
+
+ mova [cq+32*0], m15
+ mova [cq+32*1], m0
+ vpbroadcastd m15, [pw_1697x16]
+
+ REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14
+ mova m0, [cq+32*1]
+ mova [cq+32*1], m1
+ IDTX16 0, 1, 15
+ mova m1, [cq+32*0]
+ pmulhrsw m15, m1
+ paddsw m1, m1
+ paddsw m15, m1
+ mova m1, [cq+32*1]
+ jmp m(idct_16x16_internal_10bpc).end
+
+INV_TXFM_16X16_FN dct, dct, 0, 12
+INV_TXFM_16X16_FN dct, identity, 28, 12
+INV_TXFM_16X16_FN dct, adst, 0, 12
+INV_TXFM_16X16_FN dct, flipadst, 0, 12
+
+cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_16x16_internal_10bpc).pass1
+.pass2:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ mova [cq+32*12], m12
+ mova [cq+32*13], m13
+ mova [cq+32*14], m14
+ mova [cq+32*15], m15
+ call .pass2_main
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*12]
+ mova m5, [cq+32*13]
+ mova m6, [cq+32*14]
+ mova m7, [cq+32*15]
+ mov r5, r6
+ add r6, 32*16
+ call .pass2_main
+ jmp m(iadst_16x16_internal_12bpc).end
+ALIGN function_align
+.write_16x16:
+ mova [rsp+gprsize+32*0], m8
+ mova [rsp+gprsize+32*1], m9
+ mova [rsp+gprsize+32*2], m12
+ vpbroadcastd m12, [pw_16384]
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ jmp m(idct_16x16_internal_10bpc).write_16x16_2
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m2
+ mova [cq+32* 2], m4
+ mova [cq+32* 3], m6
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, m1
+ pmaxsd m1, m12, m3
+ pmaxsd m2, m12, m5
+ pmaxsd m3, m12, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ test eobd, eobd
+ jge .pass2_slow
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ jmp .pass2_fast
+.pass2_slow:
+ sub r6, 32*8
+ mova m8, [r6-32*4]
+ mova m4, [r6-32*3]
+ mova m10, [r6-32*2]
+ mova m5, [r6-32*1]
+ mova m12, [r6+32*0]
+ mova m6, [r6+32*1]
+ mova m14, [r6+32*2]
+ mova m7, [r6+32*3]
+ TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
+ mova [cq+32* 4], m8
+ mova [cq+32* 5], m10
+ mova [cq+32* 6], m12
+ mova [cq+32* 7], m14
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m4, m5, m6, m7
+.pass2_fast:
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+32* 0]
+ pmaxsd m1, m12, [cq+32* 1]
+ pmaxsd m2, m12, [cq+32* 2]
+ pmaxsd m3, m12, [cq+32* 3]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ test eobd, eobd
+ jge .pass2_slow2
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ jmp .pass2_fast2
+.pass2_slow2:
+ pmaxsd m4, m12, [cq+32* 4]
+ pmaxsd m5, m12, [cq+32* 5]
+ pmaxsd m6, m12, [cq+32* 6]
+ pmaxsd m7, m12, [cq+32* 7]
+ REPX {pminsd x, m13}, m4, m5, m6, m7
+.pass2_fast2:
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrad m11, 8 ; pd_8
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x8_internal_10bpc).pass1_rotations
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ret
+
+INV_TXFM_16X16_FN adst, dct, 0, 12
+INV_TXFM_16X16_FN adst, adst, 0, 12
+INV_TXFM_16X16_FN adst, flipadst, 0, 12
+
+cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iadst_16x16_internal_10bpc).pass1
+.pass2:
+ call .pass2_part1
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+ call .pass2_part2
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_part3:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+.end:
+ packssdw m15, m14
+ packssdw m14, m13, m12
+ packssdw m13, m11, m10
+ packssdw m12, m9, m8
+ packssdw m11, m7, m6
+ packssdw m10, m5, m4
+ packssdw m7, m3, m2
+ packssdw m6, m1, m0
+ vpblendd m0, m6, [r5-32*4], 0x33
+ vpblendd m1, m6, [r5-32*4], 0xcc
+ vpblendd m2, m7, [r5-32*3], 0x33
+ vpblendd m3, m7, [r5-32*3], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpblendd m0, m10, [r5-32*2], 0x33
+ vpblendd m1, m10, [r5-32*2], 0xcc
+ vpblendd m2, m11, [r5-32*1], 0x33
+ vpblendd m3, m11, [r5-32*1], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpblendd m0, m12, [r5+32*0], 0x33
+ vpblendd m1, m12, [r5+32*0], 0xcc
+ vpblendd m2, m13, [r5+32*1], 0x33
+ vpblendd m3, m13, [r5+32*1], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpblendd m0, m14, [r5+32*2], 0x33
+ vpblendd m1, m14, [r5+32*2], 0xcc
+ vpblendd m2, m15, [r5+32*3], 0x33
+ vpblendd m3, m15, [r5+32*3], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.pass2_part1:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ mova [cq+32*12], m12
+ mova [cq+32*13], m13
+ mova [cq+32*14], m14
+ mova [cq+32*15], m15
+.pass2_main:
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m3
+ mova [cq+32* 2], m4
+ mova [cq+32* 3], m7
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ pmaxsd m0, m13, m2
+ pmaxsd m2, m13, m6
+ pmaxsd m5, m13, m5
+ pmaxsd m7, m13, m1
+ REPX {pminsd x, m14}, m0, m2, m5, m7
+ test eobd, eobd
+ jge .pass2_slow
+ pxor m1, m1
+ REPX {mova x, m1}, m3, m4, m6
+ jmp .pass2_fast
+.pass2_slow:
+ sub r6, 32*8
+ mova m8, [r6-32*4]
+ mova m3, [r6-32*3]
+ mova m4, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m12, [r6+32*0]
+ mova m1, [r6+32*1]
+ mova m6, [r6+32*2]
+ mova m15, [r6+32*3]
+ TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
+ mova [cq+32* 4], m8
+ mova [cq+32* 5], m11
+ mova [cq+32* 6], m12
+ mova [cq+32* 7], m15
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ REPX {pmaxsd x, m13}, m1, m3, m4, m6
+ REPX {pminsd x, m14}, m1, m3, m4, m6
+.pass2_fast:
+ vpbroadcastd m12, [pd_2048]
+ vpbroadcastd m15, [pd_2896]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ pmaxsd m0, m13, [cq+32* 0] ; 0
+ pmaxsd m7, m13, [cq+32* 1] ; 3
+ pmaxsd m2, m13, [cq+32* 2] ; 4
+ pmaxsd m5, m13, [cq+32* 3] ; 7
+ REPX {pminsd x, m14}, m0, m2, m5, m7
+ test eobd, eobd
+ jge .pass2_slow2
+ pxor m1, m1
+ REPX {mova x, m1}, m3, m4, m6
+ jmp .pass2_fast2
+.pass2_slow2:
+ pmaxsd m4, m13, [cq+32* 4] ; 8
+ pmaxsd m3, m13, [cq+32* 5] ; 11
+ pmaxsd m6, m13, [cq+32* 6] ; 12
+ pmaxsd m1, m13, [cq+32* 7] ; 15
+ REPX {pminsd x, m14}, m1, m3, m4, m6
+.pass2_fast2:
+ call m(iadst_16x8_internal_10bpc).main_part2
+ vpbroadcastd m14, [pd_17408]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_17407
+ pslld m15, 3 ; pd_8
+ ret
+ALIGN function_align
+.pass2_part2:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*12]
+ mova m5, [cq+32*13]
+ mova m6, [cq+32*14]
+ mova m7, [cq+32*15]
+ mov r5, r6
+ add r6, 32*16
+ jmp .pass2_main
+
+INV_TXFM_16X16_FN flipadst, dct, 0, 12
+INV_TXFM_16X16_FN flipadst, adst, 0, 12
+INV_TXFM_16X16_FN flipadst, flipadst, 0, 12
+
+cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iflipadst_16x16_internal_10bpc).pass1
+.pass2:
+ call m(iadst_16x16_internal_12bpc).pass2_part1
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ call m(iadst_16x16_internal_12bpc).pass2_part2
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ jmp m(iadst_16x16_internal_12bpc).pass2_part3
+
+INV_TXFM_16X16_FN identity, dct, -92, 12
+INV_TXFM_16X16_FN identity, identity, 0, 12
+
+%macro IDTX16_12BPC 1 ; src
+ pmulld m6, m7, m%1
+ paddd m6, m15
+ psrad m6, 12
+ paddd m6, m%1
+ psrad m%1, m6, 1
+%endmacro
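+; i.e. m%1 = ((m%1*1697 + m15) >> 12 + m%1) >> 1, with m7 = pd_1697 and
+; m15 holding the rounding bias; 1697/4096 ~= sqrt(2)-1, so this appears
+; to be the identity16 scale with the pass 1 down-shift folded in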
+
+cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m7, [pd_1697]
+ vpbroadcastd m15, [pd_5120]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ mov r3, -32*8*4
+.righthalf:
+ mova m10, [cq+r3+32*33]
+ mova m11, [cq+r3+32*35]
+ mova m12, [cq+r3+32*37]
+ mova m13, [cq+r3+32*39]
+ add r6, 32*4
+ pmulld m0, m7, m10
+ pmulld m1, m7, m11
+ pmulld m2, m7, m12
+ pmulld m3, m7, m13
+ REPX {paddd x, m15}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ paddd m0, m10
+ paddd m1, m11
+ paddd m2, m12
+ paddd m3, m13
+ REPX {psrad x, 1 }, m0, m1, m2, m3
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ add r3, 32*8
+ jl .righthalf
+.fast:
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 1]
+ mova m2, [cq+64* 2]
+ mova m3, [cq+64* 3]
+ mova m4, [cq+64* 4]
+ mova m5, [cq+64* 5]
+ mova m8, [cq+64* 6]
+ mova m9, [cq+64* 7]
+ REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9
+ mova [cq+64*0], m8
+ mova [cq+64*1], m9
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64* 9]
+ mova m10, [cq+64*10]
+ mova m11, [cq+64*11]
+ mova m12, [cq+64*12]
+ mova m13, [cq+64*13]
+ mova m14, [cq+64*14]
+ REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14
+ mova m6, [cq+64*15]
+ pmulld m7, m6
+ paddd m7, m15
+ psrad m7, 12
+ paddd m7, m6
+ mova m6, [cq+64*0]
+ psrad m15, m7, 1
+ mova m7, [cq+64*1]
+ jmp tx2q
+.pass2:
+ call m(iidentity_8x16_internal_12bpc).pass2_main
+ call m(idct_16x16_internal_10bpc).transpose_fast
+ test eobd, eobd
+ jl .pass2_fast
+ mova [cq+32* 8], m0
+ mova [cq+32* 9], m1
+ mova [cq+32*10], m2
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*13], m5
+ mova [cq+32*14], m6
+ mova [cq+32*15], m7
+ mova m8, [r6-32*4]
+ mova m9, [r6-32*3]
+ mova m10, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m12, [r6+32*0]
+ mova m13, [r6+32*1]
+ mova m14, [r6+32*2]
+ mova m15, [r6+32*3]
+ sub r6, 32*8
+ mova m0, [r6-32*4]
+ mova m1, [r6-32*3]
+ mova m2, [r6-32*2]
+ mova m3, [r6-32*1]
+ mova m4, [r6+32*0]
+ mova m5, [r6+32*1]
+ mova m6, [r6+32*2]
+ mova m7, [r6+32*3]
+ call m(iidentity_8x16_internal_12bpc).pass2_main
+ call m(idct_16x8_internal_10bpc).transpose2
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+ mova m13, m5
+ mova m14, m6
+ mova m15, m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*12]
+ mova m5, [cq+32*13]
+ mova m6, [cq+32*14]
+ mova m7, [cq+32*15]
+.pass2_fast:
+ call m(idct_16x16_internal_12bpc).write_16x16
+ RET
+
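+; IDCT32_END combines one idct16 even-half output (m%1) with its idct16
+; partner stored at [r6] and the odd-half (t16..t31) values at [r5]/[r4]
+; into the four final idct32 outputs for that index (out0+n, out15-n,
+; out16+n, out31-n), clamping against m12/m13, adding the rounding term
+; in m11, shifting right by %6 and, when %7 & 1 (the default), packing
+; the result pairs down to words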
+%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
+ mova m%4, [r6+32*(%1-4)]
+ mova m%2, [r5+32*(3-%1)]
+ mova m%5, [r4+32*(%1-4)]
+ psubd m%3, m%1, m%4 ; idct16 out15 - n
+ paddd m%1, m%4 ; idct16 out0 + n
+ pmaxsd m%1, m12
+ pmaxsd m%3, m12
+ pminsd m%1, m13
+ pminsd m%3, m13
+ paddd m%1, m11
+ paddd m%3, m11
+ psubd m%4, m%1, m%2 ; out31 - n
+ paddd m%1, m%2 ; out0 + n
+ paddd m%2, m%3, m%5 ; out15 - n
+ psubd m%3, m%5 ; out16 + n
+ REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%if %7 & 1
+ packssdw m%1, m%3 ; out0 + n, out16 + n
+ packssdw m%2, m%4 ; out15 - n, out31 - n
+%endif
+%endmacro
+
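+; 8x32 dct, 10 bpc: pass 1 runs .pass1_main on eight input rows at a
+; time, with the eob thresholds 43/107/171 deciding how many of the four
+; groups are populated; pass 2 is borrowed from the 8-bit code
+; (main_fast for the low-eob cases, main otherwise) and the result is
+; written through the 10 bpc 8x8 write helpers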
+cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vbroadcasti128 m14, [idct32_shuf]
+ mov r4, cq
+ call .pass1_main
+ mova [rsp+32*0], m2
+ mova [rsp+32*1], m3
+ cmp eobd, 43
+ jge .eob43
+ pxor m4, m4
+ REPX {mova x, m4}, [rsp+32*2], m2, m3, m11
+ jmp .pass1_end_fast
+.eob43:
+ lea r6, [rsp+32*8]
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ call .pass1_main
+ mova [rsp+32*2], m2
+ cmp eobd, 107
+ jge .eob107
+ mova m11, m3
+ mova m2, m0
+ mova m3, m1
+ mova m0, [r6-32*4]
+ mova m1, [r6-32*3]
+ pxor m4, m4
+.pass1_end_fast:
+ vpbroadcastd m10, [pw_2048]
+ lea r6, [deint_shuf+128]
+ REPX {mova x, m4}, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .end
+.eob107:
+ mova [rsp+32*3], m3
+ mova [r6-32*2], m0
+ mova [r6-32*1], m1
+ call .pass1_main
+ cmp eobd, 171
+ jge .eob171
+ pshufd m12, m2, q1032
+ pshufd m13, m3, q1032
+ mova m4, m0
+ mova m5, m1
+ pxor m6, m6
+ REPX {mova x, m6}, m7, m14, m15
+ jmp .pass1_end
+.eob171:
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ call .pass1_main
+ pshufd m12, [r6+32*2], q1032 ; out19 out17
+ pshufd m13, [r6+32*3], q1032 ; out23 out21
+ mova m4, [r6+32*0] ; out16 out18
+ mova m5, [r6+32*1] ; out20 out22
+ pshufd m14, m2, q1032 ; out27 out25
+ pshufd m15, m3, q1032 ; out31 out29
+ mova m6, m0 ; out24 out26
+ mova m7, m1 ; out28 out30
+.pass1_end:
+ mova m0, [r6-32*4] ; out0 out2
+ mova m1, [r6-32*3] ; out4 out6
+ mova m2, [r6-32*2] ; out8 out10
+ mova m3, [r6-32*1] ; out12 out14
+ lea r6, [deint_shuf+128]
+ mova m11, [rsp+32*3] ; out13 out15
+ vpbroadcastd m10, [pw_2048]
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+.end: ; [rsp+0*32] = m12
+ vpbroadcastd m12, [pw_2048]
+ mov cq, r4
+ mova [rsp+32*1], m8
+ mova [rsp+32*2], m9
+ mova [rsp+32*3], m10
+ mova [rsp+32*4], m11
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ vpermq m0, m2, q3120
+ vpermq m1, m3, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m6, q3120
+ vpermq m1, m7, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [rsp+32*1], q3120
+ vpermq m1, [rsp+32*2], q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [rsp+32*3], q3120
+ vpermq m1, [rsp+32*4], q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [rsp+32*0], q3120
+ vpermq m1, m13, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m14, q3120
+ vpermq m1, m15, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+ALIGN function_align
+.pass1_main_part1:
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ call m(idct_8x8_internal_10bpc).main
+ psrld m1, m11, 10 ; pd_2
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.pass1_main:
+ call .pass1_main_part1
+ add cq, 32
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m4, m14
+ pshufb m6, m14
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ vperm2i128 m1, m0, m2, 0x31 ; 4 6
+ vinserti128 m0, xm2, 1 ; 0 2
+ vinserti128 m2, m3, xm4, 1 ; 1 3
+ vperm2i128 m3, m4, 0x31 ; 5 7
+ ret
+.main_oddhalf_part1_fast_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_part1_fast: ; lower half zero
+ vpbroadcastd m7, [pd_4091]
+ vpbroadcastd m8, [pd_201]
+ vpbroadcastd m6, [pd_m1380]
+ vpbroadcastd m9, [pd_3857]
+ vpbroadcastd m5, [pd_3703]
+ vpbroadcastd m10, [pd_1751]
+ vpbroadcastd m4, [pd_m2751]
+ vpbroadcastd m15, [pd_3035]
+ pmulld m7, m0
+ pmulld m0, m8
+ pmulld m6, m1
+ pmulld m1, m9
+ pmulld m5, m2
+ pmulld m2, m10
+ pmulld m4, m3
+ pmulld m3, m15
+ jmp .main_oddhalf_part1_fast2
+.main_oddhalf_part1_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
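+; first stage of the idct32 odd half: the four ITX_MULSUB_2D rotations
+; produce t16a..t31a, then two butterfly rounds with clamping against
+; m12/m13 and the 799/4017 and 1567/3784 rotations; the eight results
+; are parked at [r6-32*4]..[r6+32*3]. the _fast entries handle zeroed
+; lower-half inputs (each rotation collapses to two pmulld's) and the
+; _rect2 entries add the pd_2048/>>12 rounding for inputs pre-scaled by
+; 2896 for rectangular sizes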
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+ REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+ psubd m8, m0, m4 ; t17
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m7, m3 ; t30
+ paddd m7, m3 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ vpbroadcastd m15, [pd_4017]
+ vpbroadcastd m10, [pd_799]
+ ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m1 ; t28a
+ paddd m7, m1 ; t31a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m8, m2 ; t29
+ paddd m8, m2 ; t30
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ vpbroadcastd m15, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
+ mova [r6-32*4], m0
+ mova [r6-32*3], m5
+ mova [r6-32*2], m4
+ mova [r6-32*1], m6
+ mova [r6+32*0], m3
+ mova [r6+32*1], m1
+ mova [r6+32*2], m8
+ mova [r6+32*3], m7
+ ret
+.main_oddhalf_part2_fast_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_part2_fast: ; lower half zero
+ vpbroadcastd m7, [pd_m601]
+ vpbroadcastd m8, [pd_4052]
+ vpbroadcastd m6, [pd_3973]
+ vpbroadcastd m9, [pd_995]
+ vpbroadcastd m5, [pd_m2106]
+ vpbroadcastd m10, [pd_3513]
+ vpbroadcastd m4, [pd_3290]
+ vpbroadcastd m15, [pd_2440]
+ pmulld m7, m0
+ pmulld m0, m8
+ pmulld m6, m1
+ pmulld m1, m9
+ pmulld m5, m2
+ pmulld m2, m10
+ pmulld m4, m3
+ pmulld m3, m15
+ jmp .main_oddhalf_part2_fast2
+.main_oddhalf_part2_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
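+; second stage of the idct32 odd half: rotations for t20a..t27a,
+; butterflies and clamping as in part 1, then the merge with the part 1
+; values stored at r6 and the final 2896-based butterflies through m14
+; (pd_2896 in the callers); on return r4 and r5 point at the two 8-row
+; halves of the odd outputs and r6 has been advanced past them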
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+ REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+ psubd m8, m0, m4 ; t25
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m7, m3 ; t22
+ paddd m7, m3 ; t23
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ vpbroadcastd m15, [pd_2276]
+ vpbroadcastd m10, [pd_3406]
+ ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
+ ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m1 ; t20a
+ paddd m7, m1 ; t23a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m8, m2 ; t26
+ paddd m8, m2 ; t25
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ vpbroadcastd m15, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20
+ mova m9, [r6-32*4] ; t16a
+ mova m10, [r6-32*3] ; t17
+ psubd m2, m9, m7 ; t23
+ paddd m9, m7 ; t16
+ psubd m7, m10, m5 ; t22a
+ paddd m10, m5 ; t17a
+ REPX {pmaxsd x, m12}, m9, m10, m2, m7
+ REPX {pminsd x, m13}, m9, m10, m2, m7
+ mova [r6-32*4], m9
+ mova [r6-32*3], m10
+ mova m9, [r6-32*2] ; t18a
+ mova m10, [r6-32*1] ; t19
+ psubd m5, m9, m1 ; t21
+ paddd m9, m1 ; t18
+ psubd m1, m10, m6 ; t20a
+ paddd m10, m6 ; t19a
+ REPX {pmaxsd x, m12}, m9, m10, m5, m1
+ REPX {pminsd x, m13}, m9, m10, m5, m1
+ mova [r6-32*2], m9
+ mova [r6-32*1], m10
+ mova m9, [r6+32*0] ; t28
+ mova m10, [r6+32*1] ; t29a
+ psubd m6, m9, m3 ; t27a
+ paddd m9, m3 ; t28a
+ psubd m3, m10, m4 ; t26
+ paddd m10, m4 ; t29
+ REPX {pmaxsd x, m12}, m9, m10, m6, m3
+ REPX {pminsd x, m13}, m9, m10, m6, m3
+ REPX {pmulld x, m14}, m6, m3, m1, m5
+ paddd m6, m11
+ paddd m3, m11
+ psubd m4, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m5 ; t21a
+ paddd m3, m5 ; t26a
+ REPX {psrad x, 12 }, m4, m1, m3, m6
+ mova [r6+32*0], m4
+ mova [r6+32*1], m1
+ mova m4, [r6+32*2] ; t30
+ mova m1, [r6+32*3] ; t31a
+ psubd m5, m4, m8 ; t25a
+ paddd m4, m8 ; t30a
+ psubd m8, m1, m0 ; t24
+ paddd m1, m0 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m1
+ REPX {pminsd x, m13}, m8, m5, m4, m1
+ REPX {pmulld x, m14}, m5, m8, m7, m2
+ paddd m5, m11
+ paddd m8, m11
+ psubd m0, m5, m7 ; t22
+ paddd m5, m7 ; t25
+ psubd m7, m8, m2 ; t23a
+ paddd m2, m8 ; t24a
+ REPX {psrad x, 12 }, m0, m7, m2, m5
+ mova [r6+32*2], m0
+ mova [r6+32*3], m7
+ mov r4, r6
+ add r6, 32*8
+ mova [r6-32*4], m2
+ mova [r6-32*3], m5
+ mova [r6-32*2], m3
+ mova [r6-32*1], m6
+ mova [r6+32*0], m9
+ mova [r6+32*1], m10
+ mova [r6+32*2], m4
+ mova [r6+32*3], m1
+ mov r5, r6
+ add r6, 32*8
+ ret
+ALIGN function_align
+.main_end:
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2
+ IDCT32_END 1, 14, 8, 9, 10, 2
+ punpckhwd m8, m0, m1 ; 16 17
+ punpcklwd m0, m1 ; 0 1
+ punpcklwd m1, m14, m15 ; 14 15
+ punpckhwd m14, m15 ; 30 31
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 2
+ IDCT32_END 3, 14, 8, 9, 10, 2
+ punpckhwd m8, m2, m3 ; 18 19
+ punpcklwd m2, m3 ; 2 3
+ punpcklwd m3, m14, m15 ; 12 13
+ punpckhwd m14, m15 ; 28 29
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 2
+ IDCT32_END 5, 14, 8, 9, 10, 2
+ punpckhwd m8, m4, m5 ; 20 21
+ punpcklwd m4, m5 ; 4 5
+ punpcklwd m5, m14, m15 ; 10 11
+ punpckhwd m14, m15 ; 26 27
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 2
+ IDCT32_END 7, 14, 8, 9, 10, 2
+ punpckhwd m8, m6, m7 ; 22 23
+ punpcklwd m6, m7 ; 6 7
+ punpcklwd m7, m14, m15 ; 8 9
+ punpckhwd m14, m15 ; 24 25
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+.transpose:
+ punpckhdq m15, m3, m1
+ punpckldq m3, m1
+ punpckhdq m1, m4, m6
+ punpckldq m4, m6
+ punpckhdq m6, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m7, m5
+ punpckldq m7, m5
+ punpcklqdq m5, m2, m15
+ punpckhqdq m2, m15
+ punpckhqdq m15, m7, m3
+ punpcklqdq m7, m3
+ punpckhqdq m3, m6, m1
+ punpcklqdq m6, m1
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ vperm2i128 m4, m0, m7, 0x31
+ vinserti128 m0, xm7, 1
+ vperm2i128 m7, m3, m2, 0x31
+ vinserti128 m3, xm2, 1
+ vinserti128 m2, m6, xm5, 1
+ vperm2i128 m6, m5, 0x31
+ vperm2i128 m5, m1, m15, 0x31
+ vinserti128 m1, xm15, 1
+ ret
+
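+; 8x32 identity is done directly on words: each .loop iteration packs
+; eight rows of coefficients, applies the +pw_5/>>3 rounding, and .main
+; transposes 8x8 and adds the result to dst with clamping against m6/m7
+; (zero and the pixel max); .main_zero also clears the consumed
+; coefficient block. the eobb+21/cmovc fixup rounds the 43/107/171 eob
+; break points up to whole 64-coefficient groups, as noted inline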
+cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_5]
+ pxor m6, m6
+ mov r6d, eobd
+ add eobb, 21
+ cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {paddsw x, m5}, m0, m1, m2, m3
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ call .main_zero
+ add cq, 32
+ lea dstq, [dstq+strideq*8]
+ sub eobd, 64
+ jge .loop
+ RET
+ALIGN function_align
+.main_zero:
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main:
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m4, m2, m1
+ punpcklwd m2, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ mova xm4, [dstq+strideq*0]
+ vinserti128 m4, [dstq+strideq*4], 1
+ paddw m0, m4
+ mova xm4, [dstq+strideq*1]
+ vinserti128 m4, [dstq+r5 ], 1
+ paddw m1, m4
+ mova xm4, [dstq+strideq*2]
+ vinserti128 m4, [dstq+r6*2 ], 1
+ paddw m2, m4
+ mova xm4, [dstq+r6 ]
+ vinserti128 m4, [dstq+r4 ], 1
+ paddw m3, m4
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*4], m0, 1
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+r5 ], m1, 1
+ mova [dstq+strideq*2], xm2
+ vextracti128 [dstq+r6*2 ], m2, 1
+ mova [dstq+r6 ], xm3
+ vextracti128 [dstq+r4 ], m3, 1
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ mov r4, cq
+ lea r6, [rsp+32*4]
+ call .pass1_main
+ cmp eobd, 43
+ jge .eob43
+ jmp .pass2_fast
+.eob43:
+ call .pass1_main
+ cmp eobd, 107
+ jge .eob107
+.pass2_fast:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ jmp .pass2_end
+.eob107:
+ call .pass1_main
+ cmp eobd, 171
+ jge .eob171
+ jmp .pass2
+.eob171:
+ call .pass1_main
+.pass2:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ pmaxsd m4, m12, [cq+128*1+64]
+ pmaxsd m5, m12, [cq+128*7+64]
+ pmaxsd m6, m12, [cq+128*1+96]
+ pmaxsd m7, m12, [cq+128*7+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ pmaxsd m4, m12, [cq+128*3+64]
+ pmaxsd m5, m12, [cq+128*5+64]
+ pmaxsd m6, m12, [cq+128*3+96]
+ pmaxsd m7, m12, [cq+128*5+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ pmaxsd m4, m12, [cq+128*2+64]
+ pmaxsd m5, m12, [cq+128*6+64]
+ pmaxsd m6, m12, [cq+128*2+96]
+ pmaxsd m7, m12, [cq+128*6+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ pmaxsd m4, m12, [cq+128*0+64]
+ pmaxsd m5, m12, [cq+128*4+64]
+ pmaxsd m6, m12, [cq+128*0+96]
+ pmaxsd m7, m12, [cq+128*4+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+.pass2_end:
+ psrld m11, 8 ; pd_8
+ IDCT32_END 0, 15, 8, 9, 10, 4
+ IDCT32_END 1, 14, 8, 9, 10, 4
+ punpckhqdq m8, m0, m1 ; 16 17 (interleaved)
+ punpcklqdq m0, m1 ; 0 1 (interleaved)
+ punpcklqdq m1, m14, m15 ; 14 15 (interleaved)
+ punpckhqdq m14, m15 ; 30 31 (interleaved)
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 4
+ IDCT32_END 3, 14, 8, 9, 10, 4
+ punpckhqdq m8, m2, m3 ; 18 19 (interleaved)
+ punpcklqdq m2, m3 ; 2 3 (interleaved)
+ punpcklqdq m3, m14, m15 ; 12 13 (interleaved)
+ punpckhqdq m14, m15 ; 28 29 (interleaved)
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 4
+ IDCT32_END 5, 14, 8, 9, 10, 4
+ punpckhqdq m8, m4, m5 ; 20 21 (interleaved)
+ punpcklqdq m4, m5 ; 4 5 (interleaved)
+ punpcklqdq m5, m14, m15 ; 10 11 (interleaved)
+ punpckhqdq m14, m15 ; 26 27 (interleaved)
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 4
+ IDCT32_END 7, 14, 8, 9, 10, 4
+ punpckhqdq m8, m6, m7 ; 22 23 (interleaved)
+ punpcklqdq m6, m7 ; 6 7 (interleaved)
+ punpcklqdq m7, m14, m15 ; 8 9 (interleaved)
+ punpckhqdq m14, m15 ; 24 25 (interleaved)
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+ mova m15, m1
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m2, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m6, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m7, q3120
+ vpermq m1, m5, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m3, q3120
+ vpermq m1, m15, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*3], q3120
+ vpermq m1, [r5+32*1], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*1], q3120
+ vpermq m1, [r5-32*3], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*4], q3120
+ vpermq m1, [r5-32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*0], q3120
+ vpermq m1, [r5+32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+ALIGN function_align
+.pass1_main:
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1
+ TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 32
+ ret
+ALIGN function_align
+.main_end:
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2, 0
+ mova [cq+32*16], m8
+ mova [cq+32*31], m9
+ IDCT32_END 1, 14, 8, 9, 10, 2, 0
+ mova [cq+32*17], m8
+ mova [cq+32*30], m9
+ mova [cq+32*14], m14
+ IDCT32_END 2, 14, 8, 9, 10, 2, 0
+ mova [cq+32*18], m8
+ mova [cq+32*29], m9
+ mova [cq+32*13], m14
+ IDCT32_END 3, 14, 8, 9, 10, 2, 0
+ mova [cq+32*19], m8
+ mova [cq+32*28], m9
+ mova [cq+32*12], m14
+ IDCT32_END 4, 14, 8, 9, 10, 2, 0
+ mova [cq+32*20], m8
+ mova [cq+32*27], m9
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ IDCT32_END 5, 10, 0, 1, 2, 2, 0
+ mova [cq+32*21], m0
+ mova [cq+32*26], m1
+ IDCT32_END 6, 9, 0, 1, 2, 2, 0
+ mova [cq+32*22], m0
+ mova [cq+32*25], m1
+ IDCT32_END 7, 8, 0, 1, 2, 2, 0
+ mova [cq+32*23], m0
+ mova [cq+32*24], m1
+ mova m0, [cq+32* 0]
+ mova m1, [cq+32* 1]
+ mova m2, [cq+32* 2]
+ mova m11, m14
+ mova m12, [cq+32*12]
+ mova m13, [cq+32*13]
+ mova m14, [cq+32*14]
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1
+
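+; 32x8 dct, 10 bpc: .pass1 feeds the 32 coefficient vectors through the
+; shared 8x32 odd-half and 8x16/8x8 even-half routines, main_end forms
+; the 32 outputs, and .pass2 runs the 8-bit 16x8 idct twice, once per
+; 16-pixel half of the destination. the dc-only path boils down to
+;   dc = (dc * 181 + 640) >> 10; dc = (dc * 181 + 2176) >> 12;
+; broadcast and added to 8 rows of 32 pixels, with the paddsw/psubusw
+; pair clamping to the valid pixel range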
+cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .full
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 640
+ sar r6d, 10
+.dconly2:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm3
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ psubusw m1, m3
+ psubusw m2, m3
+ mova [dstq+32*0], m1
+ mova [dstq+32*1], m2
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.full:
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ call .pass1
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ lea r6, [deint_shuf+128]
+ vpbroadcastd m11, [pw_2048]
+ mov r4, dstq
+ call .pass2
+ mova m0, [r5+32*3] ; 16 17
+ mova m1, [r5+32*2] ; 30 31
+ mova m2, [r5+32*1] ; 18 19
+ mova m3, [r5+32*0] ; 28 29
+ mova m4, [r5-32*1] ; 20 21
+ mova m5, [r5-32*2] ; 26 27
+ mova m6, [r5-32*3] ; 22 23
+ mova m7, [r5-32*4] ; 24 25
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ lea dstq, [r4+32]
+ call .pass2
+ RET
+ALIGN function_align
+.pass2:
+ call m(idct_16x8_internal_8bpc).main
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m11, m4
+ pmulhrsw m1, m11, m5
+ pmulhrsw m2, m11, m6
+ pmulhrsw m3, m11, m7
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.pass1:
+ mova m0, [cq+32* 1]
+ mova m1, [cq+32* 7]
+ mova m2, [cq+32* 9]
+ mova m3, [cq+32*15]
+ mova m4, [cq+32*17]
+ mova m5, [cq+32*23]
+ mova m6, [cq+32*25]
+ mova m7, [cq+32*31]
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ mova m0, [cq+32* 3]
+ mova m1, [cq+32* 5]
+ mova m2, [cq+32*11]
+ mova m3, [cq+32*13]
+ mova m4, [cq+32*19]
+ mova m5, [cq+32*21]
+ mova m6, [cq+32*27]
+ mova m7, [cq+32*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ mova m0, [cq+32* 2]
+ mova m1, [cq+32* 6]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*14]
+ mova m4, [cq+32*18]
+ mova m5, [cq+32*22]
+ mova m6, [cq+32*26]
+ mova m7, [cq+32*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+32* 0]
+ mova m1, [cq+32* 4]
+ mova m2, [cq+32* 8]
+ mova m3, [cq+32*12]
+ mova m4, [cq+32*16]
+ mova m5, [cq+32*20]
+ mova m6, [cq+32*24]
+ mova m7, [cq+32*28]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_4096]
+ pxor m6, m6
+ mov r6d, eobd
+ add eobb, 21
+ cmovc eobd, r6d
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+.loop:
+ mova m0, [cq+32*0]
+ packssdw m0, [cq+32*1]
+ mova m1, [cq+32*2]
+ packssdw m1, [cq+32*3]
+ REPX {mova [cq+32*x], m6}, 0, 1, 2, 3
+ add cq, 32*8
+ mova m2, [cq-32*4]
+ packssdw m2, [cq-32*3]
+ mova m3, [cq-32*2]
+ packssdw m3, [cq-32*1]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {mova [cq+32*x], m6}, -4, -3, -2, -1
+ call m(inv_txfm_add_identity_identity_8x32_10bpc).main
+ add dstq, 16
+ sub eobd, 64
+ jge .loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .full
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.full:
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1
+ call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end
+ mov r4, dstq
+ call m(idct_16x8_internal_12bpc).pass2_main
+ mova m0, [cq+32* 0] ; 16
+ mova m1, [cq+32* 1] ; 17
+ mova m2, [cq+32* 2] ; 18
+ mova m3, [cq+32* 3] ; 19
+ mova m4, [cq+32* 4] ; 20
+ mova m5, [cq+32* 5] ; 21
+ mova m6, [cq+32* 6] ; 22
+ mova m7, [cq+32* 7] ; 23
+ mova m8, [cq+32* 8] ; 24
+ mova m9, [cq+32* 9] ; 25
+ mova m10, [cq+32*10] ; 26
+ mova m11, [cq+32*11] ; 27
+ mova m12, [cq+32*12] ; 28
+ mova m13, [cq+32*13] ; 29
+ mova m14, [cq+32*14] ; 30
+ mova m15, [cq+32*15] ; 31
+ lea dstq, [r4+32]
+ call m(idct_16x8_internal_12bpc).pass2_main
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1
+
+%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ mova m%4, [%2]
+ paddsw m%3, m%1, m%4
+ psubsw m%1, m%4
+%if %1 == 0
+ pxor m6, m6
+%endif
+ pmulhrsw m%3, m15
+ pmulhrsw m%1, m15
+ paddw m%3, [dstq+%5]
+ paddw m%1, [r2+%6]
+ pmaxsw m%3, m6
+ pmaxsw m%1, m6
+ pminsw m%3, m7
+ pminsw m%1, m7
+ mova [dstq+%5], m%3
+ mova [r2+%6], m%1
+%endmacro
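+; IDCT32_PASS2_END adds/subtracts one packed output register with its
+; matching idct32 counterpart stored at [%2], scales both by pw_2048
+; (m15), adds them to the rows at dstq+%5 and r2+%6 and clamps to
+; [0, pixel_max] (m6/m7) before storing; m6 is zeroed on the first use
+; (%1 == 0)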
+
+cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*16]
+ lea r4, [r6+32*8]
+ lea r5, [r6+32*16]
+ call .main
+ sub eobd, 44
+ jge .eob44
+ vperm2i128 m2, m0, m3, 0x31 ; 5
+ vinserti128 m0, xm3, 1 ; 1
+ vperm2i128 m3, m1, m4, 0x31 ; 7
+ vinserti128 m1, xm4, 1 ; 3
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ REPX {mova [r6+32*x], m4}, 0, 1, 2, 3
+ jmp .fast
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+.eob44:
+ mova [r4+16*0], xm0
+ mova [r4+16*1], xm3
+ mova [r4+16*2], xm1
+ mova [r4+16*3], xm4
+ vextracti128 [r4+16*4], m0, 1
+ vextracti128 [r4+16*5], m3, 1
+ vextracti128 [r4+16*6], m1, 1
+ vextracti128 [r4+16*7], m4, 1
+ call .main
+ sub eobd, 107
+ jge .eob151
+ vperm2i128 m7, m1, m4, 0x31 ; 15
+ vinserti128 m5, m1, xm4, 1 ; 11
+ vperm2i128 m6, m0, m3, 0x31 ; 13
+ vinserti128 m4, m0, xm3, 1 ; 9
+ mova m0, [r4+32*0]
+ mova m1, [r4+32*1]
+ mova m2, [r4+32*2]
+ mova m3, [r4+32*3]
+.fast:
+ lea r6, [pw_5+128]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp .idct16
+.eob151:
+ mova [r4-16*8], xm0
+ mova [r4-16*7], xm3
+ mova [r4-16*6], xm1
+ mova [r4-16*5], xm4
+ vextracti128 [r4-16*4], m0, 1
+ vextracti128 [r4-16*3], m3, 1
+ vextracti128 [r4-16*2], m1, 1
+ vextracti128 [r4-16*1], m4, 1
+ call .main
+ sub eobd, 128
+ jge .eob279
+ vperm2i128 m10, m0, m3, 0x31 ; 21
+ vinserti128 m8, m0, xm3, 1 ; 17
+ vperm2i128 m11, m1, m4, 0x31 ; 23
+ vinserti128 m9, m1, xm4, 1 ; 19
+ pxor m12, m12
+ REPX {mova x, m12}, m13, m14, m15
+ REPX {mova [r6+32*x], m12}, 0, 1, 2, 3
+ jmp .full
+.eob279:
+ mova [r5+16*0], xm0
+ mova [r5+16*1], xm3
+ mova [r5+16*2], xm1
+ mova [r5+16*3], xm4
+ vextracti128 [r5+16*4], m0, 1
+ vextracti128 [r5+16*5], m3, 1
+ vextracti128 [r5+16*6], m1, 1
+ vextracti128 [r5+16*7], m4, 1
+ call .main
+ vperm2i128 m14, m0, m3, 0x31 ; 29
+ vinserti128 m12, m0, xm3, 1 ; 25
+ vperm2i128 m15, m1, m4, 0x31 ; 31
+ vinserti128 m13, m1, xm4, 1 ; 27
+ mova m8, [r5+32*0]
+ mova m9, [r5+32*1]
+ mova m10, [r5+32*2]
+ mova m11, [r5+32*3]
+.full:
+ mova m0, [r4+32*0]
+ mova m1, [r4+32*1]
+ mova m2, [r4+32*2]
+ mova m3, [r4+32*3]
+ mova m4, [r4-32*4]
+ mova m5, [r4-32*3]
+ mova m6, [r4-32*2]
+ mova m7, [r4-32*1]
+ lea r6, [pw_5 + 128]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ lea r3, [rsp+32*8]
+ mova m8, [r3+32*0]
+ mova m9, [r3+32*1]
+ mova m10, [r3+32*2]
+ mova m11, [r3+32*3]
+ mova m12, [r3-32*4]
+ mova m13, [r3-32*3]
+ mova m14, [r3-32*2]
+ mova m15, [r3-32*1]
+.idct16:
+ lea r3, [rsp+32*16]
+ mova m0, [r3+32*0]
+ mova m1, [r3+32*1]
+ mova m2, [r3+32*2]
+ mova m3, [r3+32*3]
+ mova m4, [r3-32*4]
+ mova m5, [r3-32*3]
+ mova m6, [r3-32*2]
+ mova m7, [r3-32*1]
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ call .pass2_end
+ RET
+ALIGN function_align
+.main:
+ pmulld m0, m14, [cq+128* 1]
+ pmulld m1, m14, [cq+128* 3]
+ pmulld m2, m14, [cq+128* 5]
+ pmulld m3, m14, [cq+128* 7]
+ pmulld m4, m14, [cq+128* 9]
+ pmulld m5, m14, [cq+128*11]
+ pmulld m6, m14, [cq+128*13]
+ pmulld m7, m14, [cq+128*15]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+128* 0]
+ pmulld m1, m14, [cq+128* 2]
+ pmulld m2, m14, [cq+128* 4]
+ pmulld m3, m14, [cq+128* 6]
+ pmulld m4, m14, [cq+128* 8]
+ pmulld m5, m14, [cq+128*10]
+ pmulld m6, m14, [cq+128*12]
+ pmulld m7, m14, [cq+128*14]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrld m15, m11, 11 ; pd_1
+ mova m8, [r6-32*4]
+ mova m9, [r6-32*3]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m10, m0, m8 ; out15
+ paddd m0, m8 ; out0
+ mova m8, [r6-32*2]
+ paddd m15, m1, m9 ; out1
+ psubd m1, m9 ; out14
+ mova m9, [r6-32*1]
+ REPX {psrad x, 1}, m0, m15, m10, m1
+ packssdw m0, m15
+ packssdw m1, m10
+ psubd m10, m2, m8 ; out13
+ paddd m2, m8 ; out2
+ mova m8, [r6+32*0]
+ paddd m15, m3, m9 ; out3
+ psubd m3, m9 ; out12
+ mova m9, [r6+32*1]
+ REPX {psrad x, 1}, m2, m15, m10, m3
+ packssdw m2, m15
+ packssdw m3, m10
+ psubd m10, m4, m8 ; out11
+ paddd m4, m8 ; out4
+ mova m8, [r6+32*2]
+ paddd m15, m5, m9 ; out5
+ psubd m5, m9 ; out10
+ mova m9, [r6+32*3]
+ REPX {psrad x, 1}, m4, m10, m15, m5
+ packssdw m4, m15
+ packssdw m5, m10
+ psubd m10, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ paddd m15, m7, m9 ; out7
+ psubd m7, m9 ; out8
+ REPX {psrad x, 1}, m6, m10, m15, m7
+ packssdw m6, m15
+ packssdw m7, m10
+ punpckhwd m8, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m3, m1
+ punpcklwd m3, m1
+ punpckhwd m1, m4, m6
+ punpcklwd m4, m6
+ punpcklwd m6, m7, m5
+ punpckhwd m7, m5
+ pxor m5, m5
+ mov r7d, 128*13
+.main_zero_loop:
+ mova [cq+r7-128*1], m5
+ mova [cq+r7+128*0], m5
+ mova [cq+r7+128*1], m5
+ mova [cq+r7+128*2], m5
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ punpcklwd m5, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m1
+ punpckhwd m4, m1
+ punpckhwd m1, m0, m8
+ punpcklwd m0, m8
+ punpckhwd m8, m6, m7
+ punpcklwd m6, m7
+ punpcklqdq m7, m1, m4
+ punpckhqdq m1, m4
+ punpckhqdq m4, m8, m3
+ punpcklqdq m8, m3
+ punpckhqdq m3, m6, m5
+ punpcklqdq m6, m5
+ punpcklqdq m5, m0, m2
+ punpckhqdq m0, m2
+ mova [r6+16*0], xm5
+ mova [r6+16*1], xm6
+ mova [r6+16*2], xm7
+ mova [r6+16*3], xm8
+ vextracti128 [r6+16*4], m5, 1
+ vextracti128 [r6+16*5], m6, 1
+ vextracti128 [r6+16*6], m7, 1
+ vextracti128 [r6+16*7], m8, 1
+ sub r6, 32*4
+ ret
+ALIGN function_align
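+; Tail of the second pass: rounds by pw_2048, adds the rows to the
+; destination around dstq and r2, and clips to the 10bpc pixel range via
+; IDCT32_PASS2_END.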
+.pass2_end:
+ mova [rsp+gprsize+32*0], m6
+ mova [rsp+gprsize+32*2], m7
+ mova [rsp+gprsize+32*3], m15
+ vpbroadcastd m15, [pw_2048]
+ vpbroadcastd m7, [pixel_10bpc_max]
+ IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4
+ IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8
+ IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*1]
+ IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4
+ IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8
+ IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*0]
+ IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4
+ IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8
+ IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*2]
+ mova m2, [rsp+gprsize+32*3]
+ IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4
+ IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8
+ IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0
+ ret
+
+cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m8, [pw_2896x8]
+ vpbroadcastd m9, [pw_1697x16]
+ vpbroadcastd m11, [pw_8192]
+ lea r6, [strideq*5]
+ pxor m6, m6
+ paddw m10, m11, m11 ; pw_16384
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8-32
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8-32
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8-32
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main2:
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpcklwd m4, m2, m1
+ punpckhwd m2, m1
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2
+ punpcklqdq m0, m3, m2
+ punpckhqdq m1, m3, m2
+ jmp m(iidentity_8x8_internal_10bpc).write_2x8x2
+
+cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1
+
+cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ lea r6, [rsp+32*4]
+ call .main
+ cmp eobd, 36
+ jge .full
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
+ lea r6, [pw_5+128]
+ mov r7, dstq
+ call m(idct_16x16_internal_8bpc).main
+ call .write_16x16
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
+ jmp .end
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
+.full:
+ add cq, 32
+ mova [r4+32*3], m0
+ mova [r4+32*2], m1
+ mova [r4+32*1], m2
+ mova [r4+32*0], m3
+ mova [r4-32*1], m4
+ mova [r4-32*2], m5
+ mova [r4-32*3], m6
+ mova [r4-32*4], m7
+ call .main
+ sub r4, 32*16 ; topleft 16x8
+ call .transpose_16x16
+ lea r6, [pw_5+128]
+ mov r7, dstq
+ call m(idct_16x16_internal_8bpc).main
+ call .write_16x16
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ add r4, 32*8 ; bottomleft 16x8
+ call .transpose_16x16
+.end:
+ lea dstq, [r7+32]
+ call m(idct_16x16_internal_8bpc).main
+ call .write_16x16
+ RET
+ALIGN function_align
+.transpose_16x16:
+ punpckhdq m8, m3, m1
+ punpckldq m3, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m7, m5
+ punpckldq m7, m5
+ punpckhdq m5, m4, m6
+ punpckldq m4, m6
+ punpckhqdq m6, m0, m4
+ punpcklqdq m0, m4
+ punpckhqdq m4, m1, m5
+ punpcklqdq m1, m5
+ punpckhqdq m5, m7, m3
+ punpcklqdq m7, m3
+ punpckhqdq m3, m2, m8
+ punpcklqdq m2, m8
+ vinserti128 m8, m0, xm7, 1
+ vperm2i128 m12, m0, m7, 0x31
+ vinserti128 m9, m6, xm5, 1
+ vperm2i128 m13, m6, m5, 0x31
+ vinserti128 m10, m1, xm2, 1
+ vperm2i128 m14, m1, m2, 0x31
+ vinserti128 m11, m4, xm3, 1
+ vperm2i128 m15, m4, m3, 0x31
+ mova m0, [r4+32*3]
+ mova m1, [r4+32*2]
+ mova m2, [r4+32*1]
+ mova m3, [r4+32*0]
+ mova m4, [r4-32*1]
+ mova m5, [r4-32*2]
+ mova m6, [r4-32*3]
+ mova m7, [r4-32*4]
+ mova [rsp+gprsize], m15
+ jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ALIGN function_align
+.main:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ pmulld m0, m14, [cq+64* 1]
+ pmulld m1, m14, [cq+64* 7]
+ pmulld m2, m14, [cq+64* 9]
+ pmulld m3, m14, [cq+64*15]
+ pmulld m4, m14, [cq+64*17]
+ pmulld m5, m14, [cq+64*23]
+ pmulld m6, m14, [cq+64*25]
+ pmulld m7, m14, [cq+64*31]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
+ pmulld m0, m14, [cq+64* 3]
+ pmulld m1, m14, [cq+64* 5]
+ pmulld m2, m14, [cq+64*11]
+ pmulld m3, m14, [cq+64*13]
+ pmulld m4, m14, [cq+64*19]
+ pmulld m5, m14, [cq+64*21]
+ pmulld m6, m14, [cq+64*27]
+ pmulld m7, m14, [cq+64*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
+ pmulld m0, m14, [cq+64* 2]
+ pmulld m1, m14, [cq+64* 6]
+ pmulld m2, m14, [cq+64*10]
+ pmulld m3, m14, [cq+64*14]
+ pmulld m4, m14, [cq+64*18]
+ pmulld m5, m14, [cq+64*22]
+ pmulld m6, m14, [cq+64*26]
+ pmulld m7, m14, [cq+64*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+64* 0]
+ pmulld m1, m14, [cq+64* 4]
+ pmulld m2, m14, [cq+64* 8]
+ pmulld m3, m14, [cq+64*12]
+ pmulld m4, m14, [cq+64*16]
+ pmulld m5, m14, [cq+64*20]
+ pmulld m6, m14, [cq+64*24]
+ pmulld m7, m14, [cq+64*28]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ pxor m8, m8
+ mov r7d, 64*30
+.main_zero_loop:
+ mova [cq+r7-64*2], m8
+ mova [cq+r7-64*1], m8
+ mova [cq+r7+64*0], m8
+ mova [cq+r7+64*1], m8
+ sub r7d, 64*4
+ jg .main_zero_loop
+.main_end:
+ psrld m11, 11 ; pd_1
+ IDCT32_END 0, 15, 8, 9, 10, 1
+ IDCT32_END 1, 14, 8, 9, 10, 1
+ punpckhwd m8, m0, m1 ; 16 17
+ punpcklwd m0, m1 ; 0 1
+ punpcklwd m1, m14, m15 ; 14 15
+ punpckhwd m14, m15 ; 30 31
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 1
+ IDCT32_END 3, 14, 8, 9, 10, 1
+ punpckhwd m8, m2, m3 ; 18 19
+ punpcklwd m2, m3 ; 2 3
+ punpcklwd m3, m14, m15 ; 12 13
+ punpckhwd m14, m15 ; 28 29
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 1
+ IDCT32_END 5, 14, 8, 9, 10, 1
+ punpckhwd m8, m4, m5 ; 20 21
+ punpcklwd m4, m5 ; 4 5
+ punpcklwd m5, m14, m15 ; 10 11
+ punpckhwd m14, m15 ; 26 27
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 1
+ IDCT32_END 7, 14, 8, 9, 10, 1
+ punpckhwd m8, m6, m7 ; 22 23
+ punpcklwd m6, m7 ; 6 7
+ punpcklwd m7, m14, m15 ; 8 9
+ punpckhwd m14, m15 ; 24 25
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+ ret
+ALIGN function_align
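+; Writes a 16x16 block of second-pass output: sets up the pw_2048 rounding
+; and 10bpc clamp constants, then adds the rows to dst four at a time through
+; write_16x4.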
+.write_16x16:
+ mova m1, [rsp+gprsize+32*1]
+ mova [rsp+gprsize+32*0], m8
+ mova [rsp+gprsize+32*1], m9
+ mova [rsp+gprsize+32*2], m12
+ vpbroadcastd m12, [pw_2048]
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r3, [strideq*3]
+ pxor m8, m8
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m0, m12, m4
+ pmulhrsw m1, m12, m5
+ pmulhrsw m2, m12, m6
+ pmulhrsw m3, m12, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m0, m12, [rsp+gprsize+32*0]
+ pmulhrsw m1, m12, [rsp+gprsize+32*1]
+ pmulhrsw m2, m12, m10
+ pmulhrsw m3, m12, m11
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m0, m12, [rsp+gprsize+32*2]
+ pmulhrsw m1, m12, m13
+ pmulhrsw m2, m12, m14
+ pmulhrsw m3, m12, m15
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+
+cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m8, [pw_2896x8]
+ vpbroadcastd m9, [pw_1697x16]
+ vpbroadcastd m10, [pw_4096]
+ lea r6, [strideq*5]
+ pxor m6, m6
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+ add cq, 64*8-32
+ lea dstq, [r5+16*1]
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+ add cq, 64*8-32
+ lea dstq, [r5+16*2]
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+ add cq, 64*8-32
+ lea dstq, [r5+16*3]
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1]
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3]
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5]
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7]
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ REPX {paddsw x, x }, m0, m1, m2, m3
+ REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3
+ REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+
+cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1
+
+cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ lea r6, [rsp+32*7]
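+; The eob thresholds below decide how many first-pass slices (one call to
+; .main per 8 coefficient lanes) are computed; the remaining buffer space is
+; zeroed in .fast instead.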
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.fast:
+ lea r4, [rsp+32*71]
+ pxor m0, m0
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r3, [rsp+32*3]
+ mov r4, r6
+ lea r5, [r6+32*8]
+ lea r6, [pw_5+128]
+ call .pass2_oddhalf
+ call .pass2_evenhalf
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ sub dstq, r3
+ lea r2, [r2+r3+32]
+ add dstq, 32
+ lea r3, [rsp+32*11]
+ call .pass2_oddhalf
+ call .pass2_evenhalf
+ lea r3, [strideq*3]
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 7]
+ mova m2, [cq+128* 9]
+ mova m3, [cq+128*15]
+ mova m4, [cq+128*17]
+ mova m5, [cq+128*23]
+ mova m6, [cq+128*25]
+ mova m7, [cq+128*31]
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ mova m0, [cq+128* 3]
+ mova m1, [cq+128* 5]
+ mova m2, [cq+128*11]
+ mova m3, [cq+128*13]
+ mova m4, [cq+128*19]
+ mova m5, [cq+128*21]
+ mova m6, [cq+128*27]
+ mova m7, [cq+128*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ mova m0, [cq+128* 2]
+ mova m1, [cq+128* 6]
+ mova m2, [cq+128*10]
+ mova m3, [cq+128*14]
+ mova m4, [cq+128*18]
+ mova m5, [cq+128*22]
+ mova m6, [cq+128*26]
+ mova m7, [cq+128*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mova m4, [cq+128*16]
+ mova m5, [cq+128*20]
+ mova m6, [cq+128*24]
+ mova m7, [cq+128*28]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ ret
+ALIGN function_align
+.pass2_oddhalf:
+ mova m0, [r3+32* 1] ; 1
+ mova m1, [r3+32* 3] ; 3
+ mova m2, [r3+32* 5] ; 5
+ mova m3, [r3+32* 7] ; 7
+ mova m4, [r3+32*17] ; 9
+ mova m5, [r3+32*19] ; 11
+ mova m6, [r3+32*21] ; 13
+ mova m7, [r3+32*23] ; 15
+ mova m8, [r3+32*33] ; 17
+ mova m9, [r3+32*35] ; 19
+ mova m10, [r3+32*37] ; 21
+ mova m11, [r3+32*39] ; 23
+ mova m12, [r3+32*49] ; 25
+ mova m13, [r3+32*51] ; 27
+ mova m14, [r3+32*53] ; 29
+ mova m15, [r3+32*55] ; 31
+ jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ALIGN function_align
+.pass2_evenhalf:
+ mova m0, [r3+32* 0] ; 0
+ mova m1, [r3+32* 2] ; 2
+ mova m2, [r3+32* 4] ; 4
+ mova m3, [r3+32* 6] ; 6
+ mova m4, [r3+32*16] ; 8
+ mova m5, [r3+32*18] ; 10
+ mova m6, [r3+32*20] ; 12
+ mova m7, [r3+32*22] ; 14
+ mova m8, [r3+32*32] ; 16
+ mova m9, [r3+32*34] ; 18
+ mova m10, [r3+32*36] ; 20
+ mova m11, [r3+32*38] ; 22
+ mova m12, [r3+32*48] ; 24
+ mova m13, [r3+32*50] ; 26
+ mova m14, [r3+32*52] ; 28
+ mova m15, [r3+32*54] ; 30
+ mova [rsp+gprsize], m15
+ jmp m(idct_16x16_internal_8bpc).main
+
+cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_8192]
+ pxor m6, m6
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+ call .main ; 0
+ cmp eobd, 36
+ jl .ret
+ add cq, 128*8 ; 0 1
+ mov r7, dstq ; 1
+ add dstq, 16
+ call .main
+ call .main2
+ cmp eobd, 136
+ jl .ret
+ add cq, 128*16-32 ; 0 1 2
+ lea dstq, [r7+16*2] ; 1 2
+ call .main ; 2
+ call .main2
+ call .main2
+ cmp eobd, 300
+ jl .ret
+ add cq, 128*24-64 ; 0 1 2 3
+ add r7, 16*3 ; 1 2 3
+ mov dstq, r7 ; 2 3
+ call .main ; 3
+ call .main2
+ call .main2
+ call .main2
+ cmp eobd, 535
+ jl .ret
+ add cq, 128*24-64 ; 0 1 2 3
+ lea dstq, [r7+strideq*8] ; 1 2 3 4
+ mov r7, dstq ; 2 3 4
+ call .main ; 3 4
+ call .main2
+ call .main2
+ cmp eobd, 755
+ jl .ret
+ add cq, 128*16-32 ; 0 1 2 3
+ lea dstq, [r7+strideq*8] ; 1 2 3 4
+ call .main ; 2 3 4 5
+ call .main2 ; 3 4 5
+ cmp eobd, 911
+ jl .ret
+ add cq, 128*8 ; 0 1 2 3
+ add dstq, 16 ; 1 2 3 4
+ call .main ; 2 3 4 5
+.ret: ; 3 4 5 6
+ RET
+ALIGN function_align
+.main2:
+ sub cq, 128*8-32
+ lea dstq, [dstq+strideq*8-16]
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero
+
+cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1
+
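+; Combines the buffered idct16/idct32 partial results with the idct64 tail
+; values to form four final output rows, then rounds (pmulhrsw m14), adds
+; them to the destination rows and clamps to the 10bpc pixel range.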
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
+%if %1 & 1
+ mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n
+ mova m%4, [r4-32*(14+%1)] ; idct32 out31-n
+%else
+ mova m%5, [r4-32*(45-%1)]
+ mova m%4, [r5-32*(20+%1)]
+%endif
+ paddsw m%6, m%5, m%4 ; idct32 out 0+n
+ psubsw m%5, m%4 ; idct32 out31-n
+ paddsw m%4, m%5, m%3 ; out31-n
+ psubsw m%5, m%3 ; out32+n
+ paddsw m%3, m%6, m%2 ; out 0+n
+ psubsw m%6, m%2 ; out63-n
+ REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3
+%if %1 & 1
+ %define %%d0 r2
+ %define %%d1 dstq
+%else
+ %define %%d0 dstq
+ %define %%d1 r2
+%endif
+ paddw m%3, [%%d0+%7 ]
+ paddw m%4, [%%d1+%8 ]
+ paddw m%5, [%%d0+%9 ]
+ paddw m%6, [%%d1+%10]
+ pxor m%2, m%2
+ REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6
+ vpbroadcastd m%2, [pixel_10bpc_max]
+ REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6
+ mova [%%d0+%7 ], m%3
+ mova [%%d1+%8 ], m%4
+ mova [%%d0+%9 ], m%5
+ mova [%%d1+%10], m%6
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*6]
+ call .main
+ sub eobd, 44
+ jl .fast
+ call .main
+ sub eobd, 107
+ jl .fast
+ call .main
+ sub eobd, 128
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 64
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
+.fast:
+ lea r4, [rsp+32*38]
+ pxor m0, m0
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r6, [pw_5+128]
+ mova m0, [rsp+32* 2] ; in0
+ mova m1, [rsp+32* 6] ; in4
+ mova m2, [rsp+32*10] ; in8
+ mova m3, [rsp+32*14] ; in12
+ mova m4, [rsp+32*18] ; in16
+ mova m5, [rsp+32*22] ; in20
+ mova m6, [rsp+32*26] ; in24
+ mova m7, [rsp+32*30] ; in28
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ lea r4, [rsp+32*38]
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ add r4, 32*8
+ mova [r4-32*4], m8
+ mova [r4-32*3], m9
+ mova [r4-32*2], m10
+ mova [r4-32*1], m11
+ mova [r4+32*0], m12
+ mova [r4+32*1], m13
+ mova [r4+32*2], m14
+ mova [r4+32*3], m15
+ mova m0, [rsp+32* 4] ; in2
+ mova m1, [rsp+32* 8] ; in6
+ mova m2, [rsp+32*12] ; in10
+ mova m3, [rsp+32*16] ; in14
+ mova m4, [rsp+32*20] ; in18
+ mova m5, [rsp+32*24] ; in22
+ mova m6, [rsp+32*28] ; in26
+ mova m7, [rsp+32*32] ; in30
+ lea r5, [r4+32*16]
+ add r4, 32*8
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova m0, [rsp+32* 3] ; in1
+ mova m1, [rsp+32*33] ; in31
+ mova m2, [rsp+32*19] ; in17
+ mova m3, [rsp+32*17] ; in15
+ mova m4, [rsp+32*11] ; in9
+ mova m5, [rsp+32*25] ; in23
+ mova m6, [rsp+32*27] ; in25
+ mova m7, [rsp+32* 9] ; in7
+ lea r6, [idct64_mul - 8]
+ add r4, 32*16
+ add r5, 32*32
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ mova m0, [rsp+32* 7] ; in5
+ mova m1, [rsp+32*29] ; in27
+ mova m2, [rsp+32*23] ; in21
+ mova m3, [rsp+32*13] ; in11
+ mova m4, [rsp+32*15] ; in13
+ mova m5, [rsp+32*21] ; in19
+ mova m6, [rsp+32*31] ; in29
+ mova m7, [rsp+32* 5] ; in3
+ add r6, 8
+ add r4, 32*8
+ sub r5, 32*8
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ lea r8, [strideq*4]
+ lea r9, [strideq*5]
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+ call .main_part2_pass2
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 3]
+ mova m2, [cq+128* 5]
+ mova m3, [cq+128* 7]
+ mova m4, [cq+128* 9]
+ mova m5, [cq+128*11]
+ mova m6, [cq+128*13]
+ mova m7, [cq+128*15]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 2]
+ mova m2, [cq+128* 4]
+ mova m3, [cq+128* 6]
+ mova m4, [cq+128* 8]
+ mova m5, [cq+128*10]
+ mova m6, [cq+128*12]
+ mova m7, [cq+128*14]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ pxor m15, m15
+ mov r7d, 128*13
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ psrld m15, m11, 10 ; pd_2
+ mova m8, [r6-32*4]
+ mova m9, [r6+32*3]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m10, m0, m8 ; out15
+ paddd m0, m8 ; out0
+ mova m8, [r6-32*3]
+ psubd m15, m7, m9 ; out8
+ paddd m7, m9 ; out7
+ mova m9, [r6+32*2]
+ REPX {psrad x, 2}, m0, m15, m10, m7
+ packssdw m0, m15
+ packssdw m7, m10
+ psubd m10, m1, m8 ; out14
+ paddd m1, m8 ; out1
+ mova m8, [r6-32*2]
+ psubd m15, m6, m9 ; out9
+ paddd m6, m9 ; out6
+ mova m9, [r6+32*1]
+ REPX {psrad x, 2}, m1, m15, m10, m6
+ packssdw m1, m15
+ packssdw m6, m10
+ psubd m10, m2, m8 ; out13
+ paddd m2, m8 ; out2
+ mova m8, [r6-32*1]
+ psubd m15, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ mova m9, [r6+32*0]
+ REPX {psrad x, 2}, m2, m15, m10, m5
+ packssdw m2, m15
+ packssdw m5, m10
+ psubd m10, m3, m8 ; out12
+ paddd m3, m8 ; out3
+ psubd m15, m4, m9 ; out11
+ paddd m4, m9 ; out4
+ REPX {psrad x, 2}, m3, m15, m10, m4
+ packssdw m3, m15
+ packssdw m4, m10
+ call m(idct_16x8_internal_10bpc).transpose3
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ add r6, 32*8
+ ret
+.main_part2_pass2:
+ vpbroadcastd m11, [pw_1567_3784]
+ vpbroadcastd m12, [pw_m3784_1567]
+ vpbroadcastd m13, [pw_2896_2896]
+ lea r6, [pw_5+128]
+ lea r2, [dstq+r7]
+.main_part2_pass2_loop:
+ vpbroadcastd m14, [pw_m2896_2896]
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
+ vpbroadcastd m14, [pw_2048]
+ IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8
+ IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8
+ IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
+ IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
+ add dstq, strideq
+ sub r2, strideq
+ cmp r4, r5
+ jne .main_part2_pass2_loop
+ ret
+ALIGN function_align
+.main_part1_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_part1: ; idct64 steps 1-5
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+ vpbroadcastd m7, [r5+4*0]
+ vpbroadcastd m8, [r5+4*1]
+ vpbroadcastd m6, [r5+4*2]
+ vpbroadcastd m9, [r5+4*3]
+ vpbroadcastd m5, [r5+4*4]
+ vpbroadcastd m10, [r5+4*5]
+ vpbroadcastd m4, [r5+4*6]
+ vpbroadcastd m15, [r5+4*7]
+ pmulld m7, m0 ; t63a
+ pmulld m0, m8 ; t32a
+ pmulld m6, m1 ; t62a
+ pmulld m1, m9 ; t33a
+ pmulld m5, m2 ; t61a
+ pmulld m2, m10 ; t34a
+ pmulld m4, m3 ; t60a
+ pmulld m3, m15 ; t35a
+ vpbroadcastd m10, [r5+4*8]
+ vpbroadcastd m15, [r5+4*9]
+ REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
+ REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+ psubd m8, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m5 ; t61
+ paddd m4, m5 ; t60
+ REPX {pmaxsd x, m12}, m8, m1, m6, m2
+ REPX {pminsd x, m13}, m8, m1, m6, m2
+ ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
+ REPX {pmaxsd x, m12}, m0, m3, m7, m4
+ REPX {pminsd x, m13}, m0, m3, m7, m4
+ vpbroadcastd m10, [r5+4*10]
+ vpbroadcastd m15, [r5+4*11]
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m8, m2 ; t61
+ paddd m8, m2 ; t62
+ REPX {pmaxsd x, m12}, m5, m3, m4, m6
+ REPX {pminsd x, m13}, m5, m3, m4, m6
+ ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
+ ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
+ REPX {pmaxsd x, m12}, m0, m7, m1, m8
+ REPX {pminsd x, m13}, m0, m7, m1, m8
+ add r5, 4*12
+ mova [r6-32*4], m0
+ mova [r6+32*3], m7
+ mova [r6-32*3], m1
+ mova [r6+32*2], m8
+ mova [r6-32*2], m6
+ mova [r6+32*1], m4
+ mova [r6-32*1], m3
+ mova [r6+32*0], m5
+ add r6, 32*8
+ ret
+.main_part2: ; idct64 steps 6-9
+ lea r5, [r6+32*3]
+ sub r6, 32*4
+ vpbroadcastd m10, [pd_1567]
+ vpbroadcastd m15, [pd_3784]
+.main_part2_loop:
+ mova m0, [r6-32*32] ; t32a
+ mova m1, [r5-32*24] ; t39a
+ mova m2, [r5-32*32] ; t63a
+ mova m3, [r6-32*24] ; t56a
+ mova m4, [r6-32*16] ; t40a
+ mova m5, [r5-32* 8] ; t47a
+ mova m6, [r5-32*16] ; t55a
+ mova m7, [r6-32* 8] ; t48a
+ psubd m8, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ REPX {pmaxsd x, m12}, m8, m1, m3, m4
+ REPX {pminsd x, m13}, m8, m1, m3, m4
+ ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
+ REPX {pmaxsd x, m12}, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m0, m5, m2, m7
+ psubd m6, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m8, m4 ; t55
+ paddd m8, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ REPX {pmaxsd x, m12}, m6, m7, m5, m4
+ REPX {pminsd x, m13}, m6, m7, m5, m4
+ REPX {pmulld x, m14}, m6, m7, m5, m4
+ REPX {pmaxsd x, m12}, m2, m0, m8, m1
+ REPX {pminsd x, m13}, m2, m0, m8, m1
+ paddd m6, m11
+ paddd m5, m11
+ psubd m3, m6, m7 ; t47
+ paddd m6, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m3, m6, m7, m5
+ mova [r5-32* 8], m2
+ mova [r6-32*32], m0
+ mova [r6-32* 8], m8
+ mova [r5-32*32], m1
+ mova [r5-32*24], m3
+ mova [r6-32*16], m6
+ mova [r6-32*24], m7
+ mova [r5-32*16], m5
+ add r6, 32
+ sub r5, 32
+ cmp r6, r5
+ jl .main_part2_loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ lea r6, [rsp+32*6]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 64
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
+.fast:
+ lea r4, [rsp+32*70]
+ pxor m0, m0
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r6, [pw_5 + 128]
+ mov r10, rsp
+ lea r8, [strideq*4]
+ lea r9, [strideq*5]
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+.pass2_loop:
+ mova m0, [r10+32* 2] ; in0
+ mova m1, [r10+32* 6] ; in4
+ mova m2, [r10+32*18] ; in8
+ mova m3, [r10+32*22] ; in12
+ mova m4, [r10+32*34] ; in16
+ mova m5, [r10+32*38] ; in20
+ mova m6, [r10+32*50] ; in24
+ mova m7, [r10+32*54] ; in28
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ lea r4, [rsp+32*70]
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ add r4, 32*8
+ mova [r4-32*4], m8
+ mova [r4-32*3], m9
+ mova [r4-32*2], m10
+ mova [r4-32*1], m11
+ mova [r4+32*0], m12
+ mova [r4+32*1], m13
+ mova [r4+32*2], m14
+ mova [r4+32*3], m15
+ mova m0, [r10+32* 4] ; in2
+ mova m1, [r10+32* 8] ; in6
+ mova m2, [r10+32*20] ; in10
+ mova m3, [r10+32*24] ; in14
+ mova m4, [r10+32*36] ; in18
+ mova m5, [r10+32*40] ; in22
+ mova m6, [r10+32*52] ; in26
+ mova m7, [r10+32*56] ; in30
+ lea r5, [r4+32*16]
+ add r4, 32*8
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova m0, [r10+32* 3] ; in1
+ mova m1, [r10+32*57] ; in31
+ mova m2, [r10+32*35] ; in17
+ mova m3, [r10+32*25] ; in15
+ mova m4, [r10+32*19] ; in9
+ mova m5, [r10+32*41] ; in23
+ mova m6, [r10+32*51] ; in25
+ mova m7, [r10+32* 9] ; in7
+ lea r6, [idct64_mul - 8]
+ add r4, 32*16
+ add r5, 32*32
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ mova m0, [r10+32* 7] ; in5
+ mova m1, [r10+32*53] ; in27
+ mova m2, [r10+32*39] ; in21
+ mova m3, [r10+32*21] ; in11
+ mova m4, [r10+32*23] ; in13
+ mova m5, [r10+32*37] ; in19
+ mova m6, [r10+32*55] ; in29
+ mova m7, [r10+32* 5] ; in3
+ add r6, 8
+ add r4, 32*8
+ sub r5, 32*8
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
+ add r10, 32*8
+ sub r4, 32*98 ; rsp+32*16
+ sub dstq, r8
+ add dstq, 32
+ cmp r10, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ pmulld m0, m14, [cq+128* 1]
+ pmulld m1, m14, [cq+128* 7]
+ pmulld m2, m14, [cq+128* 9]
+ pmulld m3, m14, [cq+128*15]
+ pmulld m4, m14, [cq+128*17]
+ pmulld m5, m14, [cq+128*23]
+ pmulld m6, m14, [cq+128*25]
+ pmulld m7, m14, [cq+128*31]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
+ pmulld m0, m14, [cq+128* 3]
+ pmulld m1, m14, [cq+128* 5]
+ pmulld m2, m14, [cq+128*11]
+ pmulld m3, m14, [cq+128*13]
+ pmulld m4, m14, [cq+128*19]
+ pmulld m5, m14, [cq+128*21]
+ pmulld m6, m14, [cq+128*27]
+ pmulld m7, m14, [cq+128*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
+ pmulld m0, m14, [cq+128* 2]
+ pmulld m1, m14, [cq+128* 6]
+ pmulld m2, m14, [cq+128*10]
+ pmulld m3, m14, [cq+128*14]
+ pmulld m4, m14, [cq+128*18]
+ pmulld m5, m14, [cq+128*22]
+ pmulld m6, m14, [cq+128*26]
+ pmulld m7, m14, [cq+128*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+128* 0]
+ pmulld m1, m14, [cq+128* 4]
+ pmulld m2, m14, [cq+128* 8]
+ pmulld m3, m14, [cq+128*12]
+ pmulld m4, m14, [cq+128*16]
+ pmulld m5, m14, [cq+128*20]
+ pmulld m6, m14, [cq+128*24]
+ pmulld m7, m14, [cq+128*28]
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .normal
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+.dconly:
+ add r6d, 640
+ sar r6d, 10
+.dconly2:
+ vpbroadcastd m5, [dconly_10bpc]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm5
+ vpbroadcastw m0, xm0
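+; m0 = dc + bias: the loop below adds it to the pixels with signed saturation
+; (clamping the upper bound) and then strips the bias again with unsigned
+; saturation, clamping the lower bound at zero.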
+.dconly_loop:
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ paddsw m3, m0, [dstq+32*2]
+ paddsw m4, m0, [dstq+32*3]
+ REPX {psubusw x, m5}, m1, m2, m3, m4
+ mova [dstq+32*0], m1
+ mova [dstq+32*1], m2
+ mova [dstq+32*2], m3
+ mova [dstq+32*3], m4
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*4]
+ call .main
+ call .shift_transpose
+ cmp eobd, 36
+ jl .fast
+ call .main
+ call .shift_transpose
+ jmp .pass2
+.fast:
+ pxor m0, m0
+ mov r3d, 4
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ dec r3d
+ jg .fast_loop
+.pass2:
+ lea r7, [r6-32*64]
+ lea r4, [r6-32*32]
+ lea r6, [pw_5+128]
+ mov r5, dstq
+.pass2_loop:
+ mova m0, [r7-32*4]
+ mova m1, [r7-32*3]
+ mova m2, [r7-32*2]
+ mova m3, [r7-32*1]
+ mova m4, [r7+32*0]
+ mova m5, [r7+32*1]
+ mova m6, [r7+32*2]
+ mova m7, [r7+32*3]
+ add r7, 32*32
+ mova m8, [r7-32*4]
+ mova m9, [r7-32*3]
+ mova m10, [r7-32*2]
+ mova m11, [r7-32*1]
+ mova m12, [r7+32*0]
+ mova m13, [r7+32*1]
+ mova m14, [r7+32*2]
+ mova m15, [r7+32*3]
+ sub r7, 32*24
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16
+ add r5, 32
+ mov dstq, r5
+ cmp r7, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ lea r5, [idct64_mul_16bpc]
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+64* 7]
+ mova m1, [cq+64*25]
+ mova m2, [cq+64*23]
+ mova m3, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+64* 3]
+ mova m1, [cq+64*29]
+ mova m2, [cq+64*19]
+ mova m3, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+ mova m0, [cq+64* 2]
+ mova m1, [cq+64*14]
+ mova m2, [cq+64*18]
+ mova m3, [cq+64*30]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ mova m0, [cq+64* 6]
+ mova m1, [cq+64*10]
+ mova m2, [cq+64*22]
+ mova m3, [cq+64*26]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ mova m0, [cq+64* 4]
+ mova m1, [cq+64*12]
+ mova m2, [cq+64*20]
+ mova m3, [cq+64*28]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 8]
+ mova m2, [cq+64*16]
+ mova m3, [cq+64*24]
+ pxor m15, m15
+ mov r7d, 64*30
+.main_zero_loop:
+ mova [cq+r7-64*2], m15
+ mova [cq+r7-64*1], m15
+ mova [cq+r7+64*0], m15
+ mova [cq+r7+64*1], m15
+ sub r7d, 64*4
+ jg .main_zero_loop
+.main_end:
+ psrld m15, m11, 10 ; pd_2
+.main_end2:
+ add cq, 32
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ add r6, 32*8
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ mova [r6+32*2], m1
+ mova [r6+32*1], m2
+ mova [r6+32*0], m3
+ mova [r6-32*1], m4
+ mova [r6-32*2], m5
+ mova [r6-32*3], m6
+ mova [r6-32*4], m7
+ jmp .main_end_loop_start
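+; Each iteration folds one set of idct8/idct16/idct32/idct64 intermediates
+; into eight unshifted idct64 outputs, walking r5 forwards and r6 backwards
+; through the buffer.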
+.main_end_loop:
+ mova m0, [r6+32* 3] ; idct8 0 + n
+.main_end_loop_start:
+ mova m1, [r5+32* 4] ; idct16 15 - n
+ mova m2, [r5-32*12] ; idct32 16 + n
+ mova m3, [r6-32*13] ; idct32 31 - n
+ mova m4, [r6-32*29] ; idct64 63 - n
+ mova m5, [r5-32*28] ; idct64 48 + n
+ mova m6, [r6-32*45] ; idct64 47 - n
+ mova m7, [r5-32*44] ; idct64 32 + n
+ paddd m8, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m12}, m8, m0
+ REPX {pminsd x, m13}, m8, m0
+ paddd m1, m8, m3 ; idct32 out0 + n
+ psubd m8, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m12}, m1, m8, m3, m0
+ REPX {pminsd x, m13}, m1, m3, m8, m0
+ REPX {paddd x, m15}, m1, m3, m0, m8
+ paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
+ psubd m1, m4 ; idct64 out63 - n (unshifted)
+ paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
+ psubd m3, m5 ; idct64 out48 + n (unshifted)
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
+ psubd m8, m7 ; idct64 out32 + n (unshifted)
+ mova [r5-32*44], m2
+ mova [r6+32* 3], m1
+ mova [r6-32*45], m4
+ mova [r5+32* 4], m3
+ mova [r5-32*28], m5
+ mova [r6-32*13], m0
+ mova [r6-32*29], m6
+ mova [r5-32*12], m8
+ add r5, 32
+ sub r6, 32
+ cmp r5, r6
+ jl .main_end_loop
+ ret
+.shift_transpose:
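+; Shifts the 32-bit idct64 results right by the given amount, packs them to
+; 16-bit words, transposes and stores them back into the buffer at r5.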
+%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift
+ sub r6, 32*48
+ mov r5, r6
+%%loop:
+ mova m0, [r6-32* 4]
+ mova m4, [r6+32* 4]
+ mova m1, [r6-32* 3]
+ mova m5, [r6+32* 5]
+ mova m2, [r6-32* 2]
+ mova m6, [r6+32* 6]
+ mova m3, [r6-32* 1]
+ mova m7, [r6+32* 7]
+ REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ mova m4, [r6+32* 0]
+ mova m6, [r6+32* 8]
+ mova m5, [r6+32* 1]
+ mova m7, [r6+32* 9]
+ REPX {psrad x, %1}, m4, m6, m5, m7
+ packssdw m4, m6
+ packssdw m5, m7
+ mova m6, [r6+32* 2]
+ mova m8, [r6+32*10]
+ mova m7, [r6+32* 3]
+ mova m9, [r6+32*11]
+ REPX {psrad x, %1}, m6, m8, m7, m9
+ packssdw m6, m8
+ packssdw m7, m9
+ call m(idct_16x8_internal_10bpc).transpose3
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ add r6, 32*16
+ add r5, 32*8
+ cmp r5, r4
+ jl %%loop
+ mov r6, r4
+%endmacro
+ IDCT64_SHIFT_TRANSPOSE 2
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*7]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
+.fast:
+ pxor m0, m0
+ lea r4, [rsp+32*135]
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r7, [r6-32*32]
+ lea r5, [r6+32*8]
+ lea r6, [pw_5+128]
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+.pass2_loop:
+ mova m0, [r7-32*99]
+ mova m1, [r7-32*97]
+ mova m2, [r7-32*95]
+ mova m3, [r7-32*93]
+ mova m4, [r7-32*67]
+ mova m5, [r7-32*65]
+ mova m6, [r7-32*63]
+ mova m7, [r7-32*61]
+ mova m8, [r7-32*35]
+ mova m9, [r7-32*33]
+ mova m10, [r7-32*31]
+ mova m11, [r7-32*29]
+ mova m12, [r7-32* 3]
+ mova m13, [r7-32* 1]
+ mova m14, [r7+32* 1]
+ mova m15, [r7+32* 3]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ mova m0, [r7-32*100]
+ mova m1, [r7-32*98]
+ mova m2, [r7-32*96]
+ mova m3, [r7-32*94]
+ mova m4, [r7-32*68]
+ mova m5, [r7-32*66]
+ mova m6, [r7-32*64]
+ mova m7, [r7-32*62]
+ mova m8, [r7-32*36]
+ mova m9, [r7-32*34]
+ mova m10, [r7-32*32]
+ mova m11, [r7-32*30]
+ mova m12, [r7-32* 4]
+ mova m13, [r7-32* 2]
+ mova m14, [r7+32* 0]
+ mova m15, [r7+32* 2]
+ add r7, 32*8
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ sub dstq, r3
+ lea r2, [r2+r3+32]
+ add dstq, 32
+ cmp r7, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ lea r5, [idct64_mul_16bpc]
+ pmulld m0, m14, [cq+128* 1]
+ pmulld m1, m14, [cq+128*31]
+ pmulld m2, m14, [cq+128*17]
+ pmulld m3, m14, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ pmulld m0, m14, [cq+128* 7]
+ pmulld m1, m14, [cq+128*25]
+ pmulld m2, m14, [cq+128*23]
+ pmulld m3, m14, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ pmulld m0, m14, [cq+128* 5]
+ pmulld m1, m14, [cq+128*27]
+ pmulld m2, m14, [cq+128*21]
+ pmulld m3, m14, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ pmulld m0, m14, [cq+128* 3]
+ pmulld m1, m14, [cq+128*29]
+ pmulld m2, m14, [cq+128*19]
+ pmulld m3, m14, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+ pmulld m0, m14, [cq+128* 2]
+ pmulld m1, m14, [cq+128*14]
+ pmulld m2, m14, [cq+128*18]
+ pmulld m3, m14, [cq+128*30]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2
+ pmulld m0, m14, [cq+128* 6]
+ pmulld m1, m14, [cq+128*10]
+ pmulld m2, m14, [cq+128*22]
+ pmulld m3, m14, [cq+128*26]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2
+ pmulld m0, m14, [cq+128* 4]
+ pmulld m1, m14, [cq+128*12]
+ pmulld m2, m14, [cq+128*20]
+ pmulld m3, m14, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2
+ pmulld m0, m14, [cq+128* 0]
+ pmulld m1, m14, [cq+128* 8]
+ pmulld m2, m14, [cq+128*16]
+ pmulld m3, m14, [cq+128*24]
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ psrld m15, m11, 11 ; pd_1
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2
+ IDCT64_SHIFT_TRANSPOSE 1
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*7]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
+.fast:
+ pxor m0, m0
+ lea r4, [rsp+32*135]
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r10, [r6-32*32]
+ lea r6, [pw_5+128]
+ lea r8, [strideq*4]
+ lea r9, [strideq*5]
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+.pass2_loop:
+ mova m0, [r10-32*100] ; in0
+ mova m1, [r10-32*96] ; in4
+ mova m2, [r10-32*68] ; in8
+ mova m3, [r10-32*64] ; in12
+ mova m4, [r10-32*36] ; in16
+ mova m5, [r10-32*32] ; in20
+ mova m6, [r10-32* 4] ; in24
+ mova m7, [r10+32* 0] ; in28
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ add r4, 32*8
+ mova [r4-32*4], m8
+ mova [r4-32*3], m9
+ mova [r4-32*2], m10
+ mova [r4-32*1], m11
+ mova [r4+32*0], m12
+ mova [r4+32*1], m13
+ mova [r4+32*2], m14
+ mova [r4+32*3], m15
+ mova m0, [r10-32*98] ; in2
+ mova m1, [r10-32*94] ; in6
+ mova m2, [r10-32*66] ; in10
+ mova m3, [r10-32*62] ; in14
+ mova m4, [r10-32*34] ; in18
+ mova m5, [r10-32*30] ; in22
+ mova m6, [r10-32* 2] ; in26
+ mova m7, [r10+32* 2] ; in30
+ lea r5, [r4+32*16]
+ add r4, 32*8
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova m0, [r10-32*99] ; in1
+ mova m1, [r10+32* 3] ; in31
+ mova m2, [r10-32*35] ; in17
+ mova m3, [r10-32*61] ; in15
+ mova m4, [r10-32*67] ; in9
+ mova m5, [r10-32*29] ; in23
+ mova m6, [r10-32* 3] ; in25
+ mova m7, [r10-32*93] ; in7
+ lea r6, [idct64_mul - 8]
+ add r4, 32*16
+ add r5, 32*32
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ mova m0, [r10-32*95] ; in5
+ mova m1, [r10-32* 1] ; in27
+ mova m2, [r10-32*31] ; in21
+ mova m3, [r10-32*65] ; in11
+ mova m4, [r10-32*63] ; in13
+ mova m5, [r10-32*33] ; in19
+ mova m6, [r10+32* 1] ; in29
+ mova m7, [r10-32*97] ; in3
+ add r6, 8
+ add r4, 32*8
+ sub r5, 32*8
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
+ add r10, 32*8
+ sub dstq, r8
+ sub r4, 32*44
+ add dstq, 32
+ cmp r10, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ lea r5, [idct64_mul_16bpc]
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128*31]
+ mova m2, [cq+128*17]
+ mova m3, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+128* 7]
+ mova m1, [cq+128*25]
+ mova m2, [cq+128*23]
+ mova m3, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+128* 5]
+ mova m1, [cq+128*27]
+ mova m2, [cq+128*21]
+ mova m3, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+128* 3]
+ mova m1, [cq+128*29]
+ mova m2, [cq+128*19]
+ mova m3, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+ mova m0, [cq+128* 2]
+ mova m1, [cq+128*14]
+ mova m2, [cq+128*18]
+ mova m3, [cq+128*30]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ mova m0, [cq+128* 6]
+ mova m1, [cq+128*10]
+ mova m2, [cq+128*22]
+ mova m3, [cq+128*26]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ mova m0, [cq+128* 4]
+ mova m1, [cq+128*12]
+ mova m2, [cq+128*20]
+ mova m3, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 8]
+ mova m2, [cq+128*16]
+ mova m3, [cq+128*24]
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx16_avx512.asm b/third_party/dav1d/src/x86/itx16_avx512.asm
new file mode 100644
index 0000000000..d973655462
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_avx512.asm
@@ -0,0 +1,4133 @@
+; Copyright © 2022-2023, VideoLAN and dav1d authors
+; Copyright © 2022-2023, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23
+ db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31
+ db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55
+ db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63
+idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51
+ db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59
+ db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17
+ db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25
+iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23
+ db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31
+ db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19
+ db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27
+permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13
+ db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29
+ db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15
+ db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31
+permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2
+ db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6
+ db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7
+ db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3
+permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6
+ db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14
+ db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7
+ db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15
+idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
+ db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
+ db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
+ db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
+idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25
+ db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57
+ db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29
+ db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61
+idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30
+ db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62
+ db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31
+ db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63
+
+pw_2048_m2048: times 16 dw 2048
+pw_m2048_2048: times 16 dw -2048
+pw_2048: times 16 dw 2048
+
+; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-
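+; Each pair is stored as {a, a, b, b} dwords so that one vbroadcasti32x4
+; yields both constants, while the pd_* aliases still address them
+; individually.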
+%macro COEF_PAIR 2-3 0 ; a, b, flags
+%if %3 == 1
+pd_%1_m%2: dd %1, %1, -%2, -%2
+%define pd_%1 (pd_%1_m%2 + 4*0)
+%define pd_m%2 (pd_%1_m%2 + 4*2)
+%elif %3 == 2
+pd_m%1_%2: dd -%1, -%1, %2, %2
+%define pd_m%1 (pd_m%1_%2 + 4*0)
+%define pd_%2 (pd_m%1_%2 + 4*2)
+%else
+pd_%1_%2: dd %1, %1, %2, %2
+%define pd_%1 (pd_%1_%2 + 4*0)
+%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3 == 3
+%define pd_%2_m%2 pd_%2
+dd -%2, -%2
+%endif
+%endif
+%endmacro
+
+COEF_PAIR 201, 995
+COEF_PAIR 401, 1189, 1
+COEF_PAIR 401, 1931
+COEF_PAIR 401, 3920
+COEF_PAIR 799, 2276, 1
+COEF_PAIR 799, 3406
+COEF_PAIR 799, 4017
+COEF_PAIR 1380, 601
+COEF_PAIR 1751, 2440
+COEF_PAIR 2598, 1189
+COEF_PAIR 2598, 1931, 2
+COEF_PAIR 2598, 3612
+COEF_PAIR 2751, 2106
+COEF_PAIR 2896, 1567, 3
+COEF_PAIR 2896, 3784, 3
+COEF_PAIR 3035, 3513
+COEF_PAIR 3166, 1931
+COEF_PAIR 3166, 3612
+COEF_PAIR 3166, 3920
+COEF_PAIR 3703, 3290
+COEF_PAIR 3857, 4052
+COEF_PAIR 4017, 2276
+COEF_PAIR 4017, 3406
+COEF_PAIR 4076, 1189
+COEF_PAIR 4076, 3612
+COEF_PAIR 4076, 3920
+COEF_PAIR 4091, 3973
+
+pb_32: times 4 db 32
+pw_5: times 2 dw 5
+pw_4096: times 2 dw 4096
+pw_8192: times 2 dw 8192
+pw_1697x16: times 2 dw 1697*16
+pw_2896x8: times 2 dw 2896*8
+pixel_10bpc_max: times 2 dw 0x03ff
+dconly_10bpc: times 2 dw 0x7c00
+clip_18b_min: dd -0x20000
+clip_18b_max: dd 0x1ffff
+pd_1: dd 1
+pd_2: dd 2
+pd_1448: dd 1448
+pd_2048: dd 2048
+pd_3071: dd 3071 ; 1024 + 2048 - 1
+pd_3072: dd 3072 ; 1024 + 2048
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
+pd_5793: dd 5793
+
+cextern dup16_perm
+cextern int8_permA
+cextern idct_8x8_internal_8bpc_avx512icl.main
+cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_8x16_internal_8bpc_avx512icl.main
+cextern idct_8x16_internal_8bpc_avx512icl.main2
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast2
+cextern iadst_8x16_internal_8bpc_avx512icl.main2
+cextern idct_16x8_internal_8bpc_avx512icl.main
+cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_16x16_internal_8bpc_avx512icl.main
+cextern idct_16x16_internal_8bpc_avx512icl.main2
+cextern idct_16x16_internal_8bpc_avx512icl.main_fast
+cextern idct_16x16_internal_8bpc_avx512icl.main_fast2
+cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2
+cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
+
+SECTION .text
+
+%define o_base (pw_2048+4*128)
+%define o_base_8bpc (int8_permA+64*18)
+%define o(x) (r5 - o_base + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+INIT_ZMM avx512icl
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 1 = inv_dst1, 2 = inv_dst2
+; skip round/shift if rnd is not a number
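+; A coef argument < 32 is a register index holding a pre-broadcast constant;
+; larger values name a pd_* constant: single coefficients are broadcast per
+; dword, underscore-paired ones per 128-bit lane so each half of a lane gets
+; its own constant.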
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+%if %8 < 4096
+ vpbroadcastd m%3, [o(pd_%8)]
+%else
+ vbroadcasti32x4 m%3, [o(pd_%8)]
+%endif
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+%if %7 < 4096
+ vpbroadcastd m%5, [o(pd_%7)]
+%else
+ vbroadcasti32x4 m%5, [o(pd_%7)]
+%endif
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 2
+ psubd m%4, m%6, m%4
+ psubd m%2, m%4, m%2
+%else
+%ifnum %6
+ paddd m%4, m%6
+%endif
+ paddd m%2, m%4
+%endif
+%ifnum %6
+ paddd m%1, m%6
+%endif
+%if %9 & 1
+ psubd m%1, m%3, m%1
+%else
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
+
+%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
+cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_10bpc)
+ lea r5, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%4_internal_10bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+%if %3
+ add eobd, %3
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd ym2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw ym1, r6d
+ paddsw ym1, ym2
+.dconly_loop:
+ mova xm0, [dstq+strideq*0]
+ vinserti32x4 ym0, [dstq+strideq*1], 1
+ paddsw ym0, ym1
+ psubusw ym0, ym2
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
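+; 8x8 inverse DCT: pass 1 works on 32-bit coefficients with two of the eight
+; transform inputs per zmm register (see the "0 2", "1 5" comments); pass 2
+; packs to 16 bits and reuses the shared 8bpc AVX-512 idct8 kernel.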
+cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call .load
+ vpermi2q m1, m0, m2 ; 1 5
+ vpermi2q m3, m6, m4 ; 7 3
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ call .main_end
+ mova m4, [o(idct8x8p)]
+ packssdw m0, m2 ; 0 1 4 5
+ packssdw m1, m3 ; 3 2 7 6
+ vpermb m0, m4, m0
+ vprolq m1, 32
+ vpermb m2, m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ call m(idct_8x8_internal_8bpc).main
+ mova m10, [permC]
+ vpbroadcastd m12, [pw_2048]
+.end:
+ vpermt2q m0, m10, m1
+ vpermt2q m2, m10, m3
+.end2:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+ pmulhrsw m8, m12, m0
+ call .write_8x4_start
+ pmulhrsw m8, m12, m2
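+ ; write_8x4(_start): add one zmm (four 8-pixel rows) to dst, clamp to
+ ; [0, pixel_10bpc_max] and zero the corresponding part of the cq buffer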
+.write_8x4:
+ lea dstq, [dstq+strideq*4]
+ add cq, 64*2
+.write_8x4_start:
+ mova xm9, [dstq+strideq*0]
+ vinserti32x4 ym9, [dstq+strideq*1], 1
+ vinserti32x4 m9, [dstq+strideq*2], 2
+ vinserti32x4 m9, [dstq+r6 ], 3
+ mova [cq+64*0], m10
+ mova [cq+64*1], m10
+ paddw m9, m8
+ pmaxsw m9, m10
+ pminsw m9, m11
+ mova [dstq+strideq*0], xm9
+ vextracti32x4 [dstq+strideq*1], ym9, 1
+ vextracti32x4 [dstq+strideq*2], m9, 2
+ vextracti32x4 [dstq+r6 ], m9, 3
+ ret
+ALIGN function_align
+.load:
+ mova m0, [cq+64*0] ; 0 1
+ mova m4, [cq+64*1] ; 2 3
+ mova m1, [o(permB)]
+ mova m2, [cq+64*2] ; 4 5
+ mova m6, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m5, m1, 32
+ vpbroadcastd m12, [o(pd_2896)]
+ mova m3, m1
+ vpbroadcastd m11, [o(pd_1)]
+ ret
+ALIGN function_align
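+ ; with the bottom half of the input zero the cross terms vanish, so the
+ ; ITX_MULSUB_2D butterflies of .main reduce to plain pmulld by repacked
+ ; coefficient pairs (signs folded into the constants)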
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m3, [o(pd_4017_3406)]
+ vbroadcasti32x4 m8, [o(pd_799_m2276)]
+ vbroadcasti32x4 m2, [o(pd_2896_3784)]
+ vbroadcasti32x4 m9, [o(pd_2896_1567)]
+ pmulld m3, m1 ; t4a t5a
+ pmulld m1, m8 ; t7a t6a
+ pmulld m2, m0 ; t0 t3
+ pmulld m0, m9 ; t1 t2
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276
+ ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784
+.main2:
+ REPX {paddd x, m13}, m1, m3, m0, m2
+ REPX {psrad x, 12 }, m1, m3, m0, m2
+ punpcklqdq m8, m1, m3 ; t4a t7a
+ punpckhqdq m1, m3 ; t5a t6a
+ psubd m3, m8, m1 ; t5a t6a
+ paddd m8, m1 ; t4 t7
+ pmaxsd m3, m14
+ punpckhqdq m1, m2, m0 ; t3 t2
+ pminsd m3, m15
+ punpcklqdq m2, m0 ; t0 t1
+ pmulld m3, m12
+ paddd m0, m2, m1 ; dct4 out0 out1
+ psubd m2, m1 ; dct4 out3 out2
+ REPX {pmaxsd x, m14}, m8, m0, m2
+ REPX {pminsd x, m15}, m8, m0, m2
+.main3:
+ pshufd m1, m3, q1032
+ paddd m3, m13
+ psubd m9, m3, m1
+ paddd m3, m1
+ psrad m9, 12
+ psrad m3, 12
+ punpckhqdq m1, m8, m3 ; t7 t6
+ shufpd m8, m9, 0xaa ; t4 t5
+ ret
+.main_end:
+ paddd m0, m11
+ paddd m2, m11
+ psubd m3, m0, m1 ; out7 out6
+ paddd m0, m1 ; out0 out1
+ paddd m1, m2, m8 ; out3 out2
+ psubd m2, m8 ; out4 out5
+ REPX {vpsravd x, m11}, m0, m2, m3, m1
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+INV_TXFM_8X8_FN adst, adst
+
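+; 8x8 inverse ADST: .main yields some outputs negated; the sign flip is folded
+; into the pd_1 rounding bias (psubd m11, x) before the >>1 shift in
+; .pass1_end, and pass 2 tail-calls the shared 8bpc iadst kernel.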
+cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ punpckldq m1, m2, m4 ; out4 out6
+ punpckhdq m2, m0 ; -out5 -out7
+ punpckldq m0, m3 ; out0 out2
+ punpckhdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.pass1_end:
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m0, m1 ; 0 2 4 6
+ packssdw m4, m3 ; 1 3 5 7
+ psrlq m1, [o(permB)], 8
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ psrlq m2, m1, 32
+ vpermi2q m1, m0, m3
+ vpermt2q m0, m2, m3
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ movu m10, [permC+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ jmp m(idct_8x8_internal_10bpc).end
+.main_pass2:
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ lea r5, [o_base_8bpc]
+ pshufd ym4, ym0, q1032
+ pshufd ym5, ym1, q1032
+ jmp m(iadst_8x8_internal_8bpc).main_pass2
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612
+ ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189
+ psubd m4, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ REPX {pmaxsd x, m14}, m4, m2, m0, m1
+ REPX {pminsd x, m15}, m4, m2, m0, m1
+ pxor m5, m5
+ psubd m5, m4
+ shufpd m4, m2, 0xaa ; t4 t7
+ shufpd m2, m5, 0xaa ; t5 -t6
+ ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784
+ punpckhqdq m3, m0, m1
+ punpcklqdq m0, m1
+ psubd m1, m0, m3 ; t2 t3
+ paddd m0, m3 ; out0 -out7
+ punpckhqdq m3, m4, m2 ; t7a t6a
+ punpcklqdq m4, m2 ; t5a t4a
+ psubd m2, m4, m3 ; t7 t6
+ paddd m4, m3 ; out6 -out1
+ REPX {pmaxsd x, m14}, m1, m2
+ REPX {pminsd x, m15}, m1, m2
+ shufpd m3, m1, m2, 0xaa
+ shufpd m1, m2, 0x55
+ pmulld m3, m12
+ pmulld m1, m12
+ paddd m3, m13
+ psubd m2, m3, m1
+ paddd m3, m1
+ psrad m2, 12 ; out4 -out5
+ pshufd m3, m3, q1032
+ psrad m3, 12 ; out2 -out3
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, identity
+INV_TXFM_8X8_FN flipadst, flipadst
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call m(iadst_8x8_internal_10bpc).main
+ punpckhdq m1, m3, m4 ; -out3 -out1
+ punpckldq m3, m0 ; out2 out0
+ punpckhdq m0, m2 ; -out7 -out5
+ punpckldq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_8x8_internal_10bpc).main_pass2
+ movu m10, [permC+1]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ lea r6, [strideq*3]
+ vpermt2q m0, m10, m1 ; 7 6 5 4
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m2, m10, m3 ; 3 2 1 0
+ pxor m10, m10
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m0
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
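+; 8x8 identity: pass 1 only packs and permutes (no down-shift); the remaining
+; identity gain is folded into the final pmulhrsw, which uses pw_4096 instead
+; of the usual pw_2048.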
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ mova m1, [cq+64*0]
+ packssdw m1, [cq+64*2] ; 0 4 1 5
+ mova m2, [cq+64*1] ; 2 6 3 7
+ packssdw m2, [cq+64*3]
+ mova m0, [o(idtx8x8p)]
+ vpermb m1, m0, m1
+ vpermb m2, m0, m2
+ punpckldq m0, m1, m2 ; 0 1 4 5
+ punpckhdq m1, m2 ; 2 3 6 7
+ jmp tx2q
+.pass2:
+ movu m3, [o(permC+2)]
+ vpbroadcastd m12, [o(pw_4096)]
+ psrlq m2, m3, 32
+ vpermi2q m2, m0, m1
+ vpermt2q m0, m3, m1
+ jmp m(idct_8x8_internal_10bpc).end2
+
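+; 8x16 is rectangular (1:2), so the inputs carry an extra 1/sqrt(2) scale:
+; the DC-only path applies it as *181 >> 8 before reusing the 8x8 .dconly
+; tail, and the coefficient loads apply it as *2896 >> 12.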
+%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, adst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call .load
+ call .main
+ call .main_end
+.pass1_end:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ jmp tx2q
+.pass2:
+ mova m8, [o(idct8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpcklqdq m8, m0, m2 ; 15 1
+ punpckhqdq m0, m2 ; 7 9
+ punpckhqdq m1, m5, m4 ; 3 13
+ punpcklqdq m5, m4 ; 11 5
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym7, m8, 1 ; 14 2
+ vextracti32x8 ym3, m0, 1 ; 6 10
+ vextracti32x8 ym6, m1, 1 ; 12 4
+ vextracti32x8 ym9, m5, 1 ; 8 0
+ call m(idct_8x16_internal_8bpc).main2
+ mova m8, [permC]
+ vpbroadcastd m12, [pw_2048]
+ vpermt2q m0, m8, m1
+ lea r6, [strideq*3]
+ vpermt2q m2, m8, m3
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m4, m8, m5
+ pxor m10, m10
+ vpermt2q m6, m8, m7
+ pmulhrsw m8, m12, m0
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ jmp m(idct_8x8_internal_10bpc).write_8x4
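+ ; small-eob path: only the top half of the input rows can be non-zero, so
+ ; pass 1 loads half-height (ym) registers and reuses the 8x8 kernel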
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*1]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*7]
+ mova ym7, [cq+64*3]
+ call .round_input_fast
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ movu m6, [o(permC+3)]
+ packssdw m3, m1, m3
+ packssdw m1, m0, m2
+ vprolq m3, 32
+ vpermd m1, m6, m1
+ vpermd m3, m6, m3
+ mova ym0, ym1 ; 0 4
+ vextracti32x8 ym1, m1, 1 ; 1 5
+ mova ym2, ym3 ; 2 6
+ vextracti32x8 ym3, m3, 1 ; 3 7
+ jmp tx2q
+ALIGN function_align
+.round_input_fast:
+ movshdup m8, [o(permB)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpermt2q m0, m8, m4
+ vpermt2q m1, m8, m5
+ vpermt2q m2, m8, m6
+ vpermt2q m3, m8, m7
+ vpbroadcastd m13, [o(pd_2048)]
+ REPX {pmulld x, m12}, m0, m1, m2, m3
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m11, [o(pd_1)]
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ ret
+ALIGN function_align
+.load:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+.load2:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m0, m12, [cq+64*0]
+ pmulld m1, m12, [cq+64*1]
+ pmulld m2, m12, [cq+64*2]
+ pmulld m3, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pmulld m4, m12, [cq+64*4]
+ pmulld m5, m12, [cq+64*5]
+ pmulld m6, m12, [cq+64*6]
+ pmulld m7, m12, [cq+64*7]
+.round:
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ REPX {paddd x, m13}, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.main_fast_rect2:
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_fast:
+ pmulld m0, m12
+ pmulld m5, m3, [o(pd_2276)] {1to16} ; t5a
+ pmulld m3, [o(pd_3406)] {1to16} ; t6a
+ pmulld m7, m1, [o(pd_4017)] {1to16} ; t7a
+ pmulld m1, [o(pd_799)] {1to16} ; t4a
+ pmulld m6, m2, [o(pd_3784)] {1to16} ; t3
+ pmulld m2, [o(pd_1567)] {1to16} ; t2
+ paddd m0, m13
+ psubd m5, m13, m5
+ psrad m0, 12 ; t0
+ mova m9, m0 ; t1
+ jmp .main2
+.main_rect2:
+ call .round
+.main:
+ pmulld m0, m12
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
+ pmulld m4, m12
+ paddd m0, m13
+ paddd m5, m13
+ psubd m9, m0, m4 ; t1
+ paddd m0, m4 ; t0
+ psrad m9, 12
+ psrad m0, 12
+.main2:
+ REPX {paddd x, m13}, m3, m1, m7
+ REPX {psrad x, 12 }, m5, m1, m3, m7
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ psubd m5, m7, m3 ; t6a
+ paddd m7, m3 ; t7
+ pmaxsd m5, m14
+ pmaxsd m1, m14
+ paddd m2, m13
+ paddd m6, m13
+ pminsd m5, m15
+ pminsd m1, m15
+ pmulld m5, m12
+ pmulld m1, m12
+ pmaxsd m8, m14
+ pmaxsd m7, m14
+ pminsd m8, m15
+ paddd m5, m13
+ psubd m4, m5, m1
+ paddd m5, m1
+ REPX {psrad x, 12 }, m2, m6, m5, m4
+ paddd m1, m9, m2 ; dct4 out1
+ psubd m2, m9, m2 ; dct4 out2
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ pminsd m6, m15, m7
+ REPX {pmaxsd x, m14}, m0, m1, m2, m3
+ REPX {pminsd x, m15}, m0, m1, m2, m3
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_1)]
+.main_end2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ psubd m7, m0, m6 ; out7
+ paddd m0, m6 ; out0
+ psubd m6, m1, m5 ; out6
+ paddd m1, m5 ; out1
+ psubd m5, m2, m4 ; out5
+ paddd m2, m4 ; out2
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, identity, 35
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, adst
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call .main
+ psrad m0, 1
+ psrad m1, 1
+ psrad m6, m10, 1
+ psrad m7, m11, 1
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call .fast_main
+ punpcklqdq m1, m2, m4 ; out4 out6
+ punpckhqdq m2, m0 ; -out5 -out7
+ punpcklqdq m0, m3 ; out0 out2
+ punpckhqdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.fast_end:
+ movu m5, [o(permC+3)]
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m2, m0, m1 ; 0 2 4 6
+ packssdw m3, m4, m3 ; 1 3 5 7
+ vpermd m2, m5, m2
+ vpermd m3, m5, m3
+ mova ym0, ym2
+ vextracti32x8 ym2, m2, 1
+ mova ym1, ym3
+ vextracti32x8 ym3, m3, 1
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ movu m4, [permB+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ psrlq m7, m4, 8
+ vpermi2q m4, m0, m3 ; 0 1 2 3
+ psrlq m5, m7, 24
+ vpermi2q m7, m0, m3 ; 12 13 14 15
+ psrlq m6, m5, 8
+ vpermq m5, m5, m1 ; 4 5 6 7
+ vpermq m6, m6, m2 ; 8 9 10 11
+.pass2_end:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ pxor m10, m10
+ lea r6, [strideq*3]
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m5
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m7
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
+ psubd m8, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m5, m1 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
+ REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
+ vpbroadcastd m10, [o(pd_1567)]
+ vpbroadcastd m11, [o(pd_3784)]
+ ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
+ ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
+ vpbroadcastd m12, [o(pd_1448)]
+ psubd m9, m6, m8 ; t7
+ paddd m6, m8 ; out6
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m2 ; t2
+ paddd m0, m2 ; out0
+ psubd m2, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ REPX {pmaxsd x, m14}, m5, m3, m2, m9
+ REPX {pminsd x, m15}, m5, m3, m2, m9
+ REPX {pmulld x, m12}, m5, m3, m2, m9
+ vpbroadcastd m4, [o(pd_1)]
+ psubd m8, m5, m3 ; (t2 - t3) * 1448
+ paddd m3, m5 ; (t2 + t3) * 1448
+ psubd m5, m2, m9 ; (t6 - t7) * 1448
+ paddd m2, m9 ; (t6 + t7) * 1448
+ vpbroadcastd m9, [o(pd_3072)]
+ paddd m0, m4
+ psubd m1, m4, m1
+ paddd m10, m6, m4
+ psubd m11, m4, m7
+ paddd m2, m9
+ paddd m8, m9
+ vpbroadcastd m9, [o(pd_3071)]
+ psubd m3, m9, m3
+ psubd m9, m5
+ ret
+ALIGN function_align
+.fast_main:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*7]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*3]
+ mova ym7, [cq+64*1]
+ call m(idct_8x16_internal_10bpc).round_input_fast
+ jmp m(iadst_8x8_internal_10bpc).main
+ALIGN function_align
+.pass2_main:
+ mova m8, [o(iadst8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ vpbroadcastd m10, [o(pw_2896x8)]
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m2, m3
+ punpckldq m2, m3
+ lea r5, [o_base_8bpc]
+ punpckhqdq m4, m0, m2 ; 12 3 14 1
+ punpcklqdq m0, m2 ; 0 15 2 13
+ punpckhqdq m6, m5, m1 ; 8 7 10 5
+ punpcklqdq m5, m1 ; 4 11 6 9
+ call m(iadst_8x16_internal_8bpc).main2
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m10 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m10 ; out8 -out11 -out9 out10
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, identity, 35
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call m(iadst_8x16_internal_10bpc).main
+ psrad m7, m0, 1
+ psrad m0, m11, 1
+ psrad m6, m1, 1
+ psrad m1, m10, 1
+ psrad m5, m2, 12
+ psrad m2, m9, 12
+ psrad m4, m3, 12
+ psrad m3, m8, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_8x16_internal_10bpc).fast_main
+ punpckhqdq m1, m3, m4 ; -out3 -out1
+ punpcklqdq m3, m0 ; out2 out0
+ punpckhqdq m0, m2 ; -out7 -out5
+ punpcklqdq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x16_internal_10bpc).fast_end
+.pass2:
+ call m(iadst_8x16_internal_10bpc).pass2_main
+ movu m7, [permB+2]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ psrlq m4, m7, 8
+ vpermi2q m7, m3, m0 ; 3 2 1 0
+ psrlq m5, m4, 24
+ vpermi2q m4, m3, m0 ; 15 14 13 12
+ psrlq m6, m5, 8
+ vpermq m5, m5, m2 ; 11 10 9 8
+ vpermq m6, m6, m1 ; 7 6 5 4
+ jmp m(iadst_8x16_internal_10bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ pmulhrsw m4, m8, m0
+ pmulhrsw m5, m8, m1
+ pmulhrsw m6, m8, m2
+ pmulhrsw m7, m8, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ vpbroadcastd m7, [o(pw_2048)]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ vpbroadcastd m6, [o(pixel_10bpc_max)]
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m1
+ punpckhdq m4, m1
+ pxor m5, m5
+ punpckhqdq m1, m0, m2 ; 1 5 9 13
+ punpcklqdq m0, m2 ; 0 4 8 12
+ punpcklqdq m2, m3, m4 ; 2 6 10 14
+ punpckhqdq m3, m4 ; 3 7 11 15
+ lea r6, [strideq*3]
+ pmulhrsw m0, m7
+ call .write_8x4_start
+ pmulhrsw m0, m7, m1
+ call .write_8x4
+ pmulhrsw m0, m7, m2
+ call .write_8x4
+ pmulhrsw m0, m7, m3
+.write_8x4:
+ add dstq, strideq
+ add cq, 64*2
+.write_8x4_start:
+ mova xm4, [dstq+strideq*0]
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ vinserti32x4 m4, [dstq+strideq*8], 2
+ vinserti32x4 m4, [dstq+r6*4 ], 3
+ mova [cq+64*0], m5
+ mova [cq+64*1], m5
+ paddw m4, m0
+ pmaxsw m4, m5
+ pminsw m4, m6
+ mova [dstq+strideq*0], xm4
+ vextracti32x4 [dstq+strideq*4], ym4, 1
+ vextracti32x4 [dstq+strideq*8], m4, 2
+ vextracti32x4 [dstq+r6*4 ], m4, 3
+ ret
+
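+; 16x8 is rectangular (2:1) as well; .dconly applies the extra *181 >> 8
+; scale, and .dconly2 is also reused by the 16x16 DC-only path.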
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd m2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m1, r6d
+ paddsw m1, m2
+.dconly_loop:
+ mova ym0, [dstq+strideq*0]
+ vinserti32x8 m0, [dstq+strideq*1], 1
+ paddsw m0, m1
+ psubusw m0, m2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, -21
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, adst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m4, m12, [cq+64*0] ; 0 1
+ pmulld m9, m12, [cq+64*1] ; 2 3
+ pmulld m8, m12, [cq+64*2] ; 4 5
+ pmulld m7, m12, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m2, m2
+ mova m15, [o(permB)]
+ REPX {mova [cq+64*x], m2}, 0, 1, 2, 3
+ psrlq m0, m15, 32
+ REPX {paddd x, m13}, m4, m9, m8, m7
+ vpbroadcastd m14, [o(clip_18b_min)]
+ REPX {psrad x, 12 }, m4, m8, m9, m7
+ mova m1, m0
+ vpermi2q m0, m4, m8 ; 0 4
+ cmp eobd, 43
+ jl .fast
+ pmulld m5, m12, [cq+64*4] ; 8 9
+ pmulld m10, m12, [cq+64*5] ; 10 11
+ pmulld m11, m12, [cq+64*6] ; 12 13
+ pmulld m6, m12, [cq+64*7] ; 14 15
+ REPX {mova [cq+64*x], m2}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m5, m10, m11, m6
+ REPX {psrad x, 12 }, m10, m5, m11, m6
+ mova m2, m1
+ vpermi2q m1, m9, m10 ; 2 10
+ mova m3, m2
+ vpermi2q m2, m5, m11 ; 8 12
+ vpermi2q m3, m6, m7 ; 14 6
+ vpermt2q m4, m15, m11 ; 1 13
+ vpermt2q m6, m15, m9 ; 15 3
+ vpermt2q m5, m15, m8 ; 9 5
+ vpermt2q m7, m15, m10 ; 7 11
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main
+ call .main
+ jmp .pass1_end
+.fast:
+ vpermi2q m1, m9, m7 ; 2 6
+ vpermt2q m4, m15, m9 ; 1 3
+ vpermt2q m7, m15, m8 ; 7 5
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main_fast
+ call .main_fast
+.pass1_end:
+ call m(idct_8x16_internal_10bpc).main_end
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+.pass1_end2:
+ mova m10, m9
+ mova m11, m8
+ call .transpose_16x8
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x8_internal_8bpc).main
+ movshdup m4, [permC]
+ vpbroadcastd m11, [pw_2048]
+ psrlq m5, m4, 8
+.end:
+ vpbroadcastd m13, [pixel_10bpc_max]
+ pxor m12, m12
+ vpermq m8, m4, m0
+ vpermq m9, m5, m1
+ lea r6, [strideq*3]
+ call .write_16x4
+ vpermq m8, m4, m2
+ vpermq m9, m5, m3
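+ ; write_16x4(_noround): add two zmm (four 16-pixel rows) to dst with
+ ; clamping; the _noround entry expects pre-rounded input and skips the
+ ; pw_2048 multiply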
+.write_16x4:
+ pmulhrsw m8, m11
+ pmulhrsw m9, m11
+.write_16x4_noround:
+ mova ym10, [dstq+strideq*0]
+ vinserti32x8 m10, [dstq+strideq*1], 1
+ paddw m8, m10
+ mova ym10, [dstq+strideq*2]
+ vinserti32x8 m10, [dstq+r6 ], 1
+ paddw m9, m10
+ pmaxsw m8, m12
+ pmaxsw m9, m12
+ pminsw m8, m13
+ pminsw m9, m13
+ mova [dstq+strideq*0], ym8
+ vextracti32x8 [dstq+strideq*1], m8, 1
+ mova [dstq+strideq*2], ym9
+ vextracti32x8 [dstq+r6 ], m9, 1
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ vbroadcasti32x4 m5, [o(pd_m2598_1931)]
+ vbroadcasti32x4 m9, [o(pd_3166_3612)]
+ pmulld m6, m4 ; t15a t12a
+ pmulld m4, m3 ; t8a t11a
+ pmulld m5, m7 ; t9a t10a
+ pmulld m7, m9 ; t14a t13a
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189
+ ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612
+.main2:
+ REPX {paddd x, m13}, m4, m6, m5, m7
+ REPX {psrad x, 12 }, m4, m5, m6, m7
+ paddd m9, m4, m5 ; t8 t11
+ psubd m4, m5 ; t9 t10
+ psubd m5, m6, m7 ; t14 t13
+ paddd m6, m7 ; t15 t12
+ REPX {pmaxsd x, m14}, m5, m4, m9, m6
+ REPX {pminsd x, m15}, m5, m4, m9, m6
+.main3:
+ psubd m3, m0, m1 ; dct8 out7 out6
+ paddd m0, m1 ; dct8 out0 out1
+ vbroadcasti32x4 m7, [o(pd_3784_m3784)]
+ pmulld m7, m5
+ vpmulld m5, [o(pd_1567)] {1to16}
+ paddd m1, m2, m8 ; dct8 out3 out2
+ psubd m2, m8 ; dct8 out4 out5
+ vbroadcasti32x4 m8, [o(pd_1567_m1567)]
+ pmulld m8, m4
+ vpmulld m4, [o(pd_3784)] {1to16}
+ REPX {pmaxsd x, m14}, m0, m1
+ REPX {pminsd x, m15}, m0, m1
+ paddd m7, m13
+ paddd m5, m13
+ paddd m7, m8
+ psubd m5, m4
+ psrad m7, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
+ punpckhqdq m4, m9, m7
+ punpcklqdq m8, m9, m5
+ punpckhqdq m5, m6, m5
+ punpcklqdq m6, m7
+ psubd m7, m8, m4 ; t11a t10
+ paddd m8, m4 ; t8a t9
+ psubd m4, m6, m5 ; t12a t13
+ paddd m6, m5 ; t15a t14
+ REPX {pmaxsd x, m14}, m4, m7
+ REPX {pminsd x, m15}, m4, m7
+ pmulld m4, m12
+ pmulld m7, m12
+ REPX {pmaxsd x, m14}, m2, m3, m6, m8
+ REPX {pminsd x, m15}, m2, m3, m6, m8
+ paddd m4, m13
+ paddd m5, m4, m7
+ psubd m4, m7
+ psrad m4, 12 ; t11 t10a
+ psrad m5, 12 ; t12 t13a
+ ret
+ALIGN function_align
+.transpose_16x8:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpermi2d m8, m0, m2
+ vpermt2d m0, m9, m2
+ vpermi2d m10, m1, m3
+ vpermi2d m11, m1, m3
+ punpckhwd m3, m8, m0
+ punpcklwd m1, m8, m0
+ punpckhwd m4, m10, m11
+ punpcklwd m2, m10, m11
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, identity, -21
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, adst
+
+cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ call .main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
+ jmp m(idct_16x8_internal_10bpc).pass1_end2
+.pass2:
+ call .main_pass2
+ vpermq m8, m11, m0
+ vpermq m9, m11, m1
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ vpermq m8, m11, m2
+ vpermq m9, m11, m3
+ jmp m(idct_16x8_internal_10bpc).write_16x4_noround
+ALIGN function_align
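+ ; pass 1 of the 16-point inverse ADST on dwords; the *2896 >> 12
+ ; rectangular pre-scale is applied while loading the coefficients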
+.main_pass1:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m2, m12, [cq+64*0]
+ pmulld m7, m12, [cq+64*1]
+ pmulld m1, m12, [cq+64*2]
+ pmulld m5, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m4, m4
+ mova m10, [o(permB)]
+ REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
+ REPX {paddd x, m13}, m2, m7, m1, m5
+ psrlq m6, m10, 32
+ REPX {psrad x, 12 }, m2, m7, m1, m5
+ mova m0, m6
+ vpermi2q m0, m2, m7 ; 0 2
+ vpermt2q m7, m10, m2 ; 3 1
+ mova m2, m6
+ vpermi2q m2, m1, m5 ; 4 6
+ vpermt2q m5, m10, m1 ; 7 5
+ cmp eobd, 43
+ jl .main_fast
+ pmulld m8, m12, [cq+64*4]
+ pmulld m3, m12, [cq+64*5]
+ pmulld m9, m12, [cq+64*6]
+ pmulld m1, m12, [cq+64*7]
+ REPX {mova [cq+64*x], m4}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m8, m3, m9, m1
+ REPX {psrad x, 12 }, m8, m3, m9, m1
+ mova m4, m6
+ vpermi2q m4, m8, m3 ; 8 10
+ vpermt2q m3, m10, m8 ; 11 9
+ vpermi2q m6, m9, m1 ; 12 14
+ vpermt2q m1, m10, m9 ; 15 13
+.main:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601
+ jmp .main2
+.main_fast:
+ vbroadcasti32x4 m1, [o(pd_4091_3973)]
+ vbroadcasti32x4 m8, [o(pd_201_995)]
+ vbroadcasti32x4 m3, [o(pd_3703_3290)]
+ vbroadcasti32x4 m9, [o(pd_1751_2440)]
+ vbroadcasti32x4 m4, [o(pd_2751_2106)]
+ vbroadcasti32x4 m10, [o(pd_3035_3513)]
+ vbroadcasti32x4 m6, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m1, m0
+ pmulld m0, m8
+ pmulld m3, m2
+ pmulld m2, m9
+ pmulld m4, m5
+ pmulld m5, m10
+ pmulld m6, m7
+ pmulld m7, m11
+.main2:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {psubd x, m13, x}, m1, m3
+ REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7
+ psubd m8, m0, m4 ; t8a t10a
+ paddd m0, m4 ; t0a t2a
+ psubd m4, m1, m5 ; t9a t11a
+ paddd m1, m5 ; t1a t3a
+ psubd m5, m2, m6 ; t12a t14a
+ paddd m2, m6 ; t4a t6a
+ psubd m6, m3, m7 ; t13a t15a
+ paddd m3, m7 ; t5a t7a
+ REPX {pmaxsd x, m14}, m8, m4, m5, m6
+ REPX {pminsd x, m15}, m8, m4, m5, m6
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m2, m1, m3
+ REPX {pminsd x, m15}, m0, m2, m1, m3
+ psubd m7, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ psubd m3, m4, m6 ; t12a t14a
+ paddd m4, m6 ; t8a t10a
+ psubd m6, m8, m5 ; t13a t15a
+ paddd m8, m5 ; t9a t11a
+ REPX {pmaxsd x, m14}, m7, m3, m2, m6
+ REPX {pminsd x, m15}, m7, m3, m2, m6
+ punpcklqdq m5, m3, m7 ; t12a t4
+ punpckhqdq m3, m7 ; t14a t6
+ punpckhqdq m7, m6, m2 ; t15a t7
+ punpcklqdq m6, m2 ; t13a t5
+ vpbroadcastd m11, [o(pd_1567)]
+ vpbroadcastd m10, [o(pd_3784)]
+ ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11
+ ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m4, m1, m8
+ REPX {pminsd x, m15}, m0, m4, m1, m8
+ punpckhqdq m2, m4, m0 ; t10a t2
+ punpcklqdq m4, m0 ; t8a t0
+ punpckhqdq m0, m8, m1 ; t11a t3
+ punpcklqdq m8, m1 ; t9a t1
+ paddd m1, m6, m7 ; out2 -out3
+ psubd m6, m7 ; t14a t6
+ paddd m7, m5, m3 ; -out13 out12
+ psubd m5, m3 ; t15a t7
+ psubd m3, m8, m0 ; t11 t3a
+ paddd m8, m0 ; out14 -out15
+ paddd m0, m4, m2 ; -out1 out0
+ psubd m4, m2 ; t10 t2a
+ REPX {pmaxsd x, m14}, m6, m5, m3, m4
+ mov r6d, 0x3333
+ REPX {pminsd x, m15}, m6, m5, m3, m4
+ kmovw k1, r6d
+ REPX {pmulld x, m12}, m6, m5, m3, m4
+ pxor m9, m9
+ REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
+ paddd m6, m13
+ paddd m4, m13
+ paddd m2, m6, m5 ; -out5 out4
+ psubd m6, m5 ; out10 -out11
+ psubd m5, m4, m3 ; -out9 out8
+ paddd m3, m4 ; out6 -out7
+ REPX {psrad x, 12}, m2, m3, m5, m6
+ REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
+ ret
+ALIGN function_align
+.main_pass2:
+ lea r5, [o_base_8bpc]
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m11, [permC]
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ vpbroadcastd m13, [pixel_10bpc_max]
+ pxor m12, m12
+ lea r6, [strideq*3]
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, identity, -21
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+
+cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_10bpc).main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_16x8_internal_10bpc).main_pass2
+ psrlq m11, 8
+ vpermq m8, m11, m3
+ vpermq m9, m11, m2
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ vpermq m8, m11, m1
+ vpermq m9, m11, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4_noround
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ vpbroadcastd m8, [o(pd_5793)]
+ vpbroadcastd m13, [o(pd_3072)]
+ pxor m10, m10
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x16_internal_10bpc).round
+ psrlq m8, [o(permA)], 16
+ psrlq m9, m8, 8
+ mova m10, m8
+ mova m11, m9
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ jmp tx2q
+.pass2:
+ movshdup m4, [o(permC)]
+ vpbroadcastd m11, [o(pw_4096)]
+ mova m5, m4
+ jmp m(idct_16x8_internal_10bpc).end
+
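+; 16x16 is square, so the DC-only path needs no extra rect scale and jumps
+; straight into the 16x8 .dconly2 tail.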
+%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, 28
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, adst
+
+cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 36
+ jl .fast
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 2]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64* 6]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64*10]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64*14]
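+ ; on WIN64, xmm6/xmm7 are callee-saved; they are stashed in the coefficient
+ ; buffer (which is cleared only after .pass1_end restores them)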
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+64* 1]
+ mova m17, [cq+64* 3]
+ mova m18, [cq+64* 5]
+ mova m19, [cq+64* 7]
+ mova m20, [cq+64* 9]
+ mova m21, [cq+64*11]
+ mova m22, [cq+64*13]
+ mova m23, [cq+64*15]
+ call .main
+ call .main_end
+.pass1_end:
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+.pass1_end2:
+ call .main_end3
+.pass1_end3:
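+ ; clear the entire coefficient buffer, four 64-byte rows per iteration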
+ mov r6d, 64*12
+ pxor m8, m8
+.zero_loop:
+ mova [cq+r6+64*3], m8
+ mova [cq+r6+64*2], m8
+ mova [cq+r6+64*1], m8
+ mova [cq+r6+64*0], m8
+ sub r6d, 64*4
+ jge .zero_loop
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x16_internal_8bpc).main
+ movshdup m12, [permC]
+ vpbroadcastd m11, [pw_2048]
+ psrlq m13, m12, 8
+ vpermq m8, m12, m0
+ vpermq m0, m13, m7
+ vpermq m7, m13, m1
+ vpermq m1, m12, m6
+ vpermq m6, m12, m2
+ vpermq m2, m13, m5
+ vpermq m5, m13, m3
+ vpermq m3, m12, m4
+.pass2_end:
+ lea r6, [strideq*3]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ pxor m12, m12
+ pmulhrsw m8, m11, m8
+ pmulhrsw m9, m11, m7
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ pmulhrsw m8, m11, m6
+ pmulhrsw m9, m11, m5
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ pmulhrsw m8, m11, m3
+ pmulhrsw m9, m11, m2
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ pmulhrsw m8, m11, m1
+ pmulhrsw m9, m11, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4_noround
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym2, [cq+64*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+64*2]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*1]
+ mova ym5, [cq+64*3]
+ mova ym6, [cq+64*5]
+ mova ym7, [cq+64*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_2)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
+ALIGN function_align
+.main_fast_rect2:
+ REPX {paddd x, m13}, m16, m17, m18, m19
+ REPX {psrad x, 12 }, m16, m17, m18, m19
+.main_fast:
+ pmulld m23, m16, [o(pd_4076)] {1to16} ; t15a
+ pmulld m16, [o(pd_401)] {1to16} ; t8a
+ pmulld m20, m19, [o(pd_2598)] {1to16} ; t9a
+ pmulld m19, [o(pd_3166)] {1to16} ; t14a
+ pmulld m22, m17, [o(pd_1189)] {1to16} ; t11a
+ pmulld m17, [o(pd_3920)] {1to16} ; t12a
+ pmulld m21, m18, [o(pd_3612)] {1to16} ; t13a
+ pmulld m18, [o(pd_1931)] {1to16} ; t10a
+ psubd m20, m13, m20
+ psubd m22, m13, m22
+ call .round2
+ jmp .main2
+.main_rect2:
+ call .round
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a
+ call .round
+.main2:
+ paddd m9, m20, m16 ; t8
+ psubd m20, m16, m20 ; t9
+ psubd m16, m22, m18 ; t10
+ paddd m18, m22 ; t11
+ paddd m22, m23, m19 ; t15
+ psubd m23, m19 ; t14
+ psubd m19, m17, m21 ; t13
+ paddd m17, m21 ; t12
+ vpbroadcastd m11, [o(pd_3784)]
+ REPX {pmaxsd x, m14}, m20, m23, m16, m19
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m20, m23, m16, m19
+ ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
+ ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m9, m18, m22, m17
+ REPX {pminsd x, m15}, m9, m18, m22, m17
+ paddd m21, m20, m19 ; t14
+ psubd m20, m19 ; t13
+ psubd m19, m9, m18 ; t11a
+ paddd m9, m18 ; t8a
+ psubd m18, m23, m16 ; t10
+ paddd m16, m23 ; t9
+ psubd m23, m22, m17 ; t12a
+ paddd m22, m17 ; t15a
+ REPX {pmaxsd x, m14}, m20, m23, m18, m19
+ REPX {pminsd x, m15}, m20, m23, m18, m19
+ REPX {pmulld x, m12}, m20, m23, m18, m19
+ psubd m7, m0, m6 ; dct8 out7
+ paddd m0, m6 ; dct8 out0
+ psubd m6, m1, m5 ; dct8 out6
+ paddd m1, m5 ; dct8 out1
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1
+ psubd m5, m2, m4 ; dct8 out5
+ paddd m2, m4 ; dct8 out2
+ REPX {pminsd x, m15}, m7, m0, m6, m1
+ psubd m4, m3, m8 ; dct8 out4
+ paddd m3, m8 ; dct8 out3
+ REPX {pmaxsd x, m14}, m5, m2, m4, m3
+ paddd m20, m13
+ paddd m23, m13
+ REPX {pminsd x, m15}, m5, m2, m4, m3
+ psubd m17, m20, m18 ; t10a
+ paddd m20, m18 ; t13a
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ psubd m18, m23, m19 ; t11
+ paddd m19, m23 ; t12
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_2)]
+.main_end2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m23, m0, m22 ; out15
+ paddd m0, m22 ; out0
+ psubd m22, m1, m21 ; out14
+ paddd m1, m21 ; out1
+ psubd m21, m2, m20 ; out13
+ paddd m2, m20 ; out2
+ psubd m20, m3, m19 ; out12
+ paddd m3, m19 ; out3
+ psubd m19, m4, m18 ; out11
+ paddd m4, m18 ; out4
+ psubd m18, m5, m17 ; out10
+ paddd m5, m17 ; out5
+ psubd m17, m6, m16 ; out9
+ paddd m6, m16 ; out6
+ psubd m16, m7, m9 ; out8
+ paddd m7, m9 ; out7
+ REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
+ m4, m20, m5, m21, m6, m22, m7, m23
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m20
+ packssdw m5, m21
+ packssdw m6, m22
+ packssdw m7, m23
+ ret
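+ ; .main_end3: transpose the packed 16-bit results so that each register
+ ; ends up holding two consecutive rows (0 1, 2 3, ..., 14 15)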
+.main_end3:
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ punpckhdq m7, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m3, m6
+ punpckldq m3, m6
+ vshufi32x4 m6, m0, m4, q3232
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m4, m8, ym3, 1
+ vshufi32x4 m8, m3, q3232
+ vinserti32x8 m3, m7, ym1, 1
+ vshufi32x4 m7, m1, q3232
+ vshufi32x4 m1, m2, m5, q3232
+ vinserti32x8 m2, ym5, 1
+ vshufi32x4 m5, m7, m1, q2020 ; 10 11
+ vshufi32x4 m7, m1, q3131 ; 14 15
+ vshufi32x4 m1, m3, m2, q2020 ; 2 3
+ vshufi32x4 m3, m2, q3131 ; 6 7
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ ret
+ALIGN function_align
+.round:
+ paddd m20, m13
+ paddd m22, m13
+.round2:
+ paddd m16, m13
+ paddd m18, m13
+.round3:
+ REPX {psrad x, 12 }, m16, m18, m20, m22
+ REPX {paddd x, m13}, m17, m19, m21, m23
+ REPX {psrad x, 12 }, m17, m19, m21, m23
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, flipadst
+INV_TXFM_16X16_FN adst, adst
+
+cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call .main_pass1
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m5, m20
+ packssdw m5, m6, m21
+ packssdw m6, m7, m22
+ packssdw m7, m8, m23
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call .main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_fast_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+.pass1_fast_end2:
+ mova m10, m9
+ mova m11, m8
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m12, [permC]
+ mova m11, [pw_2048_m2048]
+ psrlq m13, m12, 8
+ vpermq m8, m13, m0
+ vpermq m0, m12, m7
+ vpermq m7, m13, m1
+ vpermq m1, m12, m6
+ vpermq m6, m13, m2
+ vpermq m2, m12, m5
+ vpermq m5, m13, m3
+ vpermq m3, m12, m4
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+ALIGN function_align
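+ ; full 16-point inverse ADST pass 1 on dwords, using zmm16-23 for the upper
+ ; half of the state; the pd_1448-scaled outputs are rounded with
+ ; pd_5120/pd_5119 and shifted by 13, the rest get the usual +2 >> 2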
+.main_pass1:
+ mova m0, [cq+64* 0]
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m23, [cq+64*15]
+ vpbroadcastd m13, [o(pd_2048)]
+ ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0
+ mova m7, [cq+64* 7]
+ mova m16, [cq+64* 8]
+ ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8
+ mova m2, [cq+64* 2]
+ mova m21, [cq+64*13]
+ ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2
+ mova m5, [cq+64* 5]
+ mova m18, [cq+64*10]
+ ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10
+ mova m4, [cq+64* 4]
+ mova m19, [cq+64*11]
+ ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4
+ mova m3, [cq+64* 3]
+ mova m20, [cq+64*12]
+ ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12
+ mova m6, [cq+64* 6]
+ mova m17, [cq+64* 9]
+ ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6
+ mova m1, [cq+64* 1]
+ mova m22, [cq+64*14]
+ ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psubd m9, m23, m7 ; t9a
+ paddd m23, m7 ; t1a
+ psubd m7, m2, m18 ; t10a
+ paddd m18, m2 ; t2a
+ REPX {pmaxsd x, m14}, m9, m23, m7, m18
+ psubd m2, m17, m1 ; t15a
+ paddd m17, m1 ; t7a
+ REPX {pminsd x, m15}, m9, m23, m7, m18
+ psubd m1, m21, m5 ; t11a
+ paddd m21, m5 ; t3a
+ REPX {pmaxsd x, m14}, m2, m17, m1, m21
+ psubd m5, m4, m20 ; t12a
+ paddd m4, m20 ; t4a
+ REPX {pminsd x, m15}, m2, m17, m1, m21
+ psubd m20, m19, m3 ; t13a
+ paddd m19, m3 ; t5a
+ REPX {pmaxsd x, m14}, m5, m4, m20, m19
+ psubd m8, m6, m22 ; t14a
+ paddd m6, m22 ; t6a
+ REPX {pminsd x, m15}, m5, m4, m20, m19
+ psubd m22, m0, m16 ; t8a
+ paddd m16, m0 ; t0a
+ REPX {pmaxsd x, m14}, m8, m6, m22, m16
+ vpbroadcastd m11, [o(pd_4017)]
+ vpbroadcastd m10, [o(pd_799)]
+ REPX {pminsd x, m15}, m8, m6, m22, m16
+ ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8
+ ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13
+ vpbroadcastd m11, [o(pd_2276)]
+ vpbroadcastd m10, [o(pd_3406)]
+ ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10
+ ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15
+ paddd m0, m16, m4 ; t0
+ psubd m16, m4 ; t4
+ psubd m3, m23, m19 ; t5
+ paddd m23, m19 ; t1
+ REPX {pmaxsd x, m14}, m0, m16, m3, m23
+ psubd m19, m18, m6 ; t6
+ paddd m18, m6 ; t2
+ REPX {pminsd x, m15}, m0, m16, m3, m23
+ psubd m6, m21, m17 ; t7
+ paddd m21, m17 ; t3
+ REPX {pmaxsd x, m14}, m19, m18, m6, m21
+ paddd m17, m9, m20 ; t8a
+ psubd m9, m20 ; t12a
+ REPX {pminsd x, m15}, m19, m18, m6, m21
+ psubd m20, m22, m5 ; t13a
+ paddd m22, m5 ; t9a
+ REPX {pmaxsd x, m14}, m17, m9, m20, m22
+ psubd m5, m1, m2 ; t14a
+ paddd m1, m2 ; t10a
+ REPX {pminsd x, m15}, m17, m9, m20, m22
+ psubd m2, m7, m8 ; t15a
+ paddd m7, m8 ; t11a
+ REPX {pmaxsd x, m14}, m5, m1, m2, m7
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m5, m1, m2, m7
+ ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a
+ ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a
+ ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12
+ ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15
+ psubd m8, m0, m18 ; t2a
+ paddd m0, m18 ; out0
+ psubd m18, m23, m21 ; t3a
+ paddd m23, m21 ; -out15
+ paddd m21, m9, m5 ; -out13
+ psubd m9, m5 ; t15a
+ psubd m5, m3, m6 ; t6
+ paddd m3, m6 ; -out3
+ REPX {pmaxsd x, m14}, m8, m18, m9, m5
+ psubd m6, m20, m2 ; t14a
+ paddd m2, m20 ; out2
+ paddd m20, m16, m19 ; out12
+ psubd m16, m19 ; t7
+ REPX {pminsd x, m15}, m8, m18, m9, m5
+ psubd m19, m22, m7 ; t11
+ paddd m22, m7 ; out14
+ psubd m7, m17, m1 ; t10
+ paddd m1, m17 ; -out1
+ REPX {pmaxsd x, m14}, m6, m16, m19, m7
+ vpbroadcastd m12, [o(pd_1448)]
+ vpbroadcastd m4, [o(pd_2)]
+ vpbroadcastd m10, [o(pd_5120)]
+ vpbroadcastd m11, [o(pd_5119)]
+ REPX {pminsd x, m15}, m6, m16, m19, m7
+ psubd m17, m7, m19 ; -out9
+ paddd m7, m19 ; out6
+ psubd m19, m5, m16 ; -out11
+ paddd m5, m16 ; out4
+ REPX {pmulld x, m12}, m17, m7, m19, m5
+ psubd m16, m8, m18 ; out8
+ paddd m8, m18 ; -out7
+ psubd m18, m6, m9 ; out10
+ paddd m6, m9 ; -out5
+ REPX {pmulld x, m12}, m16, m8, m18, m6
+ REPX {paddd x, m4 }, m0, m2, m20, m22
+ REPX {psubd x, m4, x}, m1, m3, m21, m23
+ REPX {paddd x, m10 }, m7, m5, m16, m18
+ REPX {psubd x, m11, x}, m17, m19, m8, m6
+ REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3
+ REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8
+ ret
+ALIGN function_align
+.main_pass1_fast:
+ mova ym0, [cq+64*0]
+ mova ym1, [cq+64*2]
+ movshdup m8, [o(permB)]
+ mova ym6, [cq+64*1]
+ mova ym7, [cq+64*3]
+ mova ym2, [cq+64*4]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*5]
+ mova ym5, [cq+64*7]
+ vpermt2q m0, m8, m1 ; 0 2
+ vpermt2q m7, m8, m6 ; 3 1
+ vpermt2q m2, m8, m3 ; 4 6
+ vpermt2q m5, m8, m4 ; 7 5
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ jmp m(iadst_16x8_internal_10bpc).main_fast
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call m(iadst_16x16_internal_10bpc).main_pass1
+ packssdw m4, m19, m3
+ packssdw m3, m20, m5
+ packssdw m5, m18, m2
+ packssdw m2, m21, m6
+ packssdw m6, m17, m1
+ packssdw m1, m22, m7
+ packssdw m7, m16, m0
+ packssdw m0, m23, m8
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_16x16_internal_10bpc).main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m12, [permC]
+ movu m11, [pw_m2048_2048]
+ psrlq m13, m12, 8
+ vpermq m8, m13, m7
+ vpermq m7, m13, m6
+ vpermq m6, m13, m5
+ vpermq m5, m13, m4
+ vpermq m3, m12, m3
+ vpermq m2, m12, m2
+ vpermq m1, m12, m1
+ vpermq m0, m12, m0
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+
+INV_TXFM_16X16_FN identity, dct, -92
+INV_TXFM_16X16_FN identity, identity
+
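+; 16x16 identity: pass 1 effectively folds the identity16 gain (2*sqrt(2))
+; and the pass-1 >>2 into a single *5793 >> 13; pass 2 computes the gain as
+; 2*x + pmulhrsw(x, pw_1697x16) before the usual pw_2048 rounded store.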
+cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m10, [o(pd_5793)]
+ vpbroadcastd m11, [o(pd_5120)]
+ mov r6, cq
+ cmp eobd, 36
+ jl .fast
+ call .pass1_main
+ packssdw m0, m6, m8
+ packssdw m1, m7, m9
+ call .pass1_main
+ packssdw m2, m6, m8
+ packssdw m3, m7, m9
+ call .pass1_main
+ packssdw m4, m6, m8
+ packssdw m5, m7, m9
+ call .pass1_main
+ packssdw m6, m8
+ packssdw m7, m9
+ jmp m(idct_16x16_internal_10bpc).pass1_end2
+.fast:
+ call .pass1_main_fast
+ packssdw m0, m6, m7
+ call .pass1_main_fast
+ packssdw m1, m6, m7
+ call .pass1_main_fast
+ packssdw m2, m6, m7
+ call .pass1_main_fast
+ packssdw m3, m6, m7
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckldq m3, m4, m1
+ punpckhdq m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ pxor m7, m7
+ vshufi32x4 m2, m0, m3, q3131
+ vshufi32x4 m0, m3, q2020
+ vshufi32x4 m3, m1, m4, q3131
+ vshufi32x4 m1, m4, q2020
+ REPX {mova x, m7}, m4, m5, m6
+ jmp m(idct_16x16_internal_10bpc).pass1_end3
+.pass2:
+ movshdup m14, [o(permC)]
+ vpbroadcastd m15, [o(pw_1697x16)]
+ lea r6, [strideq*3]
+ vpbroadcastd m11, [o(pw_2048)]
+ pxor m12, m12
+ vpbroadcastd m13, [pixel_10bpc_max]
+ vpermq m8, m14, m0
+ vpermq m9, m14, m1
+ call .pass2_main
+ vpermq m8, m14, m2
+ vpermq m9, m14, m3
+ call .pass2_main
+ vpermq m8, m14, m4
+ vpermq m9, m14, m5
+ call .pass2_main
+ vpermq m8, m14, m6
+ vpermq m9, m14, m7
+.pass2_main:
+ pmulhrsw m0, m15, m8
+ pmulhrsw m1, m15, m9
+ paddsw m8, m8
+ paddsw m9, m9
+ paddsw m8, m0
+ paddsw m9, m1
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+ALIGN function_align
+.pass1_main:
+ pmulld m6, m10, [r6+64*0]
+ pmulld m7, m10, [r6+64*1]
+ pmulld m8, m10, [r6+64*8]
+ pmulld m9, m10, [r6+64*9]
+ add r6, 64*2
+ REPX {paddd x, m11}, m6, m7, m8, m9
+ REPX {psrad x, 13 }, m6, m8, m7, m9
+ ret
+ALIGN function_align
+.pass1_main_fast:
+ mova ym6, [r6+64* 0]
+ vinserti32x8 m6, [r6+64* 4], 1
+ mova ym7, [r6+64* 8]
+ vinserti32x8 m7, [r6+64*12], 1
+ add r6, 64
+ REPX {pmulld x, m10}, m6, m7
+ REPX {paddd x, m11}, m6, m7
+ REPX {psrad x, 13 }, m6, m7
+ ret
+
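+; 8x32: pass 1 applies the 8-point transform to 8, 16 or 32 rows depending on
+; eob (thresholds 43 and 107); pass 2 runs the 32-point half through the
+; shared 8bpc AVX-512 kernels.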
+cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ vpbroadcastd m11, [o(pd_2)]
+ mova m20, [o(idct8x32p)]
+ pxor m21, m21
+ cmp eobd, 43
+ jl .fast
+ call .pass1_main
+ punpcklwd m16, m0, m1
+ punpcklwd m17, m2, m3
+ punpckhwd m18, m0, m1
+ punpckhwd m19, m2, m3
+ cmp eobd, 107
+ jge .full
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ call m(idct_8x16_internal_8bpc).main_fast
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .end
+.full:
+ add cq, 64
+ call .pass1_main
+ punpcklwd m5, m0, m1
+ punpcklwd m6, m2, m3
+ punpckhwd m7, m0, m1
+ punpckhwd m8, m2, m3
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ punpckldq m4, m5, m6 ; 16 18
+ punpckhdq m5, m6 ; 20 22
+ punpckldq m6, m7, m8 ; 24 26
+ punpckhdq m7, m8 ; 28 30
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ vextracti32x8 ym18, m4, 1
+ vextracti32x8 ym19, m5, 1
+ vextracti32x8 ym20, m6, 1
+ vextracti32x8 ym21, m7, 1
+ call m(idct_8x16_internal_8bpc).main
+ REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+ jmp .end
+.fast:
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*1]
+ mova ym5, [cq+128*5]
+ mova ym7, [cq+128*3]
+ mova ym3, [cq+128*7]
+ mova ym0, [cq+128*0]
+ mova ym4, [cq+128*2]
+ mova ym2, [cq+128*4]
+ mova ym6, [cq+128*6]
+ vpermt2q m1, m8, m5 ; 1 5
+ vpermt2q m3, m8, m7 ; 7 3
+ vpermt2q m0, m8, m4 ; 0 2
+ vpermt2q m2, m8, m6 ; 4 6
+ mova [cq+128*0], ym21
+ REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ packssdw m0, m2
+ packssdw m1, m3
+ vpermb m0, m20, m0
+ vprold m20, 16
+ vpermb m2, m20, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ call m(idct_8x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2
+.end:
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper
+ lea r3, [strideq*2]
+ vpbroadcastd m12, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m11, m11
+ lea r3, [dstq+r3*8]
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ call .write_8x4x2
+ pmulhrsw m0, m10, m2
+ pmulhrsw m1, m10, m3
+ call .write_8x4x2
+ pmulhrsw m0, m10, m4
+ pmulhrsw m1, m10, m5
+ call .write_8x4x2
+ pmulhrsw m0, m10, m6
+ pmulhrsw m1, m10, m7
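+ ; each call adds m0 to four rows starting at dstq and m1 to the four rows
+ ; at r3 (16 rows further down) in reversed row order, clamped to the
+ ; 10-bit range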
+.write_8x4x2:
+ mova xm8, [dstq+strideq*0]
+ vinserti32x4 ym8, [dstq+strideq*1], 1
+ vinserti32x4 m8, [dstq+strideq*2], 2
+ vinserti32x4 m8, [dstq+r6 ], 3
+ mova xm9, [r3 +r6 ]
+ vinserti32x4 ym9, [r3 +strideq*2], 1
+ vinserti32x4 m9, [r3 +strideq*1], 2
+ vinserti32x4 m9, [r3 +strideq*0], 3
+ paddw m8, m0
+ paddw m9, m1
+ pmaxsw m8, m11
+ pmaxsw m9, m11
+ pminsw m8, m12
+ pminsw m9, m12
+ mova [dstq+strideq*0], xm8
+ vextracti32x4 [dstq+strideq*1], ym8, 1
+ vextracti32x4 [dstq+strideq*2], m8, 2
+ vextracti32x4 [dstq+r6 ], m8, 3
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 [r3 +strideq*0], m9, 3
+ vextracti32x4 [r3 +strideq*1], m9, 2
+ vextracti32x4 [r3 +strideq*2], ym9, 1
+ mova [r3 +r6 ], xm9
+ lea r3, [r3+strideq*4]
+ ret
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+ALIGN function_align
+.pass1_main:
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x16_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_end2
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ REPX {vpermb x, m20, x}, m0, m1, m2, m3
+ ret
+
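+; identity*identity 8x32 needs no butterflies: coefficients are packed to 16
+; bits, rounded with paddsw pw_5 / psraw 3, reordered in-register to match
+; the dst layout and added with clamping; each loop iteration covers 16 rows
+; and the second one only runs when eob >= 107.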
+cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
+ vpbroadcastd m9, [pw_5]
+ lea r4, [strideq*3]
+ pxor m10, m10
+ lea r5, [strideq*5]
+ vpbroadcastd m11, [pixel_10bpc_max]
+ sub eobd, 107
+ lea r6, [strideq+r4*2]
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ lea r7, [dstq+strideq*8]
+ REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
+ REPX {paddsw x, m9}, m0, m1, m2, m3
+ REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ add cq, 64
+ mova xm4, [dstq+strideq*0]
+ mova xm5, [dstq+strideq*1]
+ mova xm6, [dstq+strideq*2]
+ mova xm7, [dstq+r4 *1]
+ punpckhwd m8, m0, m1
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ punpcklwd m0, m1
+ vinserti32x4 ym5, [dstq+r5 *1], 1
+ punpckhwd m1, m2, m3
+ vinserti32x4 ym6, [dstq+r4 *2], 1
+ punpcklwd m2, m3
+ vinserti32x4 ym7, [dstq+r6 *1], 1
+ punpckhwd m3, m0, m8
+ vinserti32x4 m4, [r7 +strideq*0], 2
+ punpcklwd m0, m8
+ vinserti32x4 m5, [r7 +strideq*1], 2
+ punpckhwd m8, m2, m1
+ vinserti32x4 m6, [r7 +strideq*2], 2
+ punpcklwd m2, m1
+ vinserti32x4 m7, [r7 +r4 *1], 2
+ punpckhqdq m1, m0, m2
+ vinserti32x4 m4, [r7 +strideq*4], 3
+ punpcklqdq m0, m2
+ vinserti32x4 m5, [r7 +r5 *1], 3
+ punpcklqdq m2, m3, m8
+ vinserti32x4 m6, [r7 +r4 *2], 3
+ punpckhqdq m3, m8
+ vinserti32x4 m7, [r7 +r6 *1], 3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ REPX {pmaxsw x, m10}, m0, m1, m2, m3
+ REPX {pminsw x, m11}, m0, m1, m2, m3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ mova [dstq+strideq*2], xm2
+ mova [dstq+r4 *1], xm3
+ vextracti32x4 [dstq+strideq*4], ym0, 1
+ vextracti32x4 [dstq+r5 *1], ym1, 1
+ vextracti32x4 [dstq+r4 *2], ym2, 1
+ vextracti32x4 [dstq+r6 *1], ym3, 1
+ lea dstq, [r7+strideq*8]
+ vextracti32x4 [r7 +strideq*0], m0, 2
+ vextracti32x4 [r7 +strideq*1], m1, 2
+ vextracti32x4 [r7 +strideq*2], m2, 2
+ vextracti32x4 [r7 +r4 *1], m3, 2
+ vextracti32x4 [r7 +strideq*4], m0, 3
+ vextracti32x4 [r7 +r5 *1], m1, 3
+ vextracti32x4 [r7 +r4 *2], m2, 3
+ vextracti32x4 [r7 +r6 *1], m3, 3
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
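+; 32x8: pass 1 builds the 32-point transform from the 8-, 16- and 32-point
+; dword helpers (eob selects the fast/full variants), then the results are
+; transposed and pass 2 reuses the shared 8bpc 32x8 kernel.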
+cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ mova m11, [o(permB)]
+ mova m0, [cq+64* 0] ; 0 1
+ mova m4, [cq+64* 1] ; 2 3
+ mova m1, [cq+64* 2] ; 4 5
+ mova m8, [cq+64* 3] ; 6 7
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m10, m11, 32
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m16, m11
+ vpermi2q m16, m0, m1 ; 1 5
+ mova m17, m11
+ vpermi2q m17, m8, m4 ; 7 3
+ cmp eobd, 43
+ jl .fast
+ mova m18, [cq+64* 4] ; 8 9
+ mova m20, [cq+64* 5] ; 10 11
+ mova m6, [cq+64* 6] ; 12 13
+ mova m7, [cq+64* 7] ; 14 15
+ vpermt2q m0, m10, m18 ; 0 8
+ vpermt2q m18, m11, m6 ; 9 13
+ mova m19, m11
+ vpermi2q m19, m7, m20 ; 15 11
+ cmp eobd, 107
+ jge .full
+ vpermt2q m1, m10, m6 ; 4 12
+ vpermt2q m4, m10, m8 ; 2 6
+ vpermt2q m7, m10, m20 ; 14 10
+ mov r6d, 64*1
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ call .main_fast
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.full:
+ mova m2, [cq+64* 8] ; 16 17
+ mova m5, [cq+64* 9] ; 18 19
+ mova m9, [cq+64*10] ; 20 21
+ mova m21, [cq+64*11] ; 22 23
+ vpermt2q m1, m10, m9 ; 4 20
+ vpermt2q m7, m10, m21 ; 14 22
+ vpermt2q m21, m11, m5 ; 23 19
+ vpermt2q m5, m10, m20 ; 18 10
+ mova m20, m11
+ vpermi2q m20, m2, m9 ; 17 21
+ mova m22, [cq+64*12] ; 24 25
+ mova m9, [cq+64*13] ; 26 27
+ mova m3, [cq+64*14] ; 28 29
+ mova m23, [cq+64*15] ; 30 31
+ vpermt2q m2, m10, m22 ; 16 24
+ vpermt2q m22, m11, m3 ; 25 29
+ vpermt2q m3, m10, m6 ; 28 12
+ vpermt2q m4, m10, m9 ; 2 26
+ mova m6, m10
+ vpermi2q m6, m23, m8 ; 30 6
+ vpermt2q m23, m11, m9 ; 31 27
+ mov r6d, 64*3
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_16x8_internal_10bpc).main
+ call .main
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.fast:
+ vpermq m0, m10, m0 ; 0 0
+ vpermq m1, m10, m1 ; 4 4
+ vpermt2q m4, m10, m8 ; 2 6
+ xor r6d, r6d
+ call .main_fast2
+ call m(idct_16x16_internal_10bpc).main_end
+.end:
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+ call .transpose_8x32
+ pxor m14, m14
+.zero_loop:
+ mova [cq+r6*4+64*3], m14
+ mova [cq+r6*4+64*2], m14
+ mova [cq+r6*4+64*1], m14
+ mova [cq+r6*4+64*0], m14
+ sub r6d, 64
+ jge .zero_loop
+ lea r5, [o_base_8bpc]
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pxor m12, m12
+.write_32x8_start:
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ lea r3, [strideq*3]
+.write_32x8:
+ pmulhrsw m0, m11
+ pmulhrsw m1, m11
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+ call .write_32x4
+ pmulhrsw m0, m11, m4
+ pmulhrsw m1, m11, m5
+ pmulhrsw m2, m11, m6
+ pmulhrsw m3, m11, m7
+.write_32x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3 ]
+ REPX {pmaxsw x, m12}, m0, m1, m2, m3
+ REPX {pminsw x, m13}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ lea dstq, [dstq+strideq*4]
+ ret
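+; eob == 0: only the DC coefficient is present, so the block reduces to a single offset
+; added to all 32x8 pixels. 181 = round(128*sqrt(2)); the imul/add/sar chain here and in
+; the shared .dconly2 tail computes the scaled DC value, which is broadcast, added to
+; each row and clamped. r3d carries the row count (8).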
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 8
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
+ALIGN function_align
+.main_fast2: ; bottom three-quarters are zero
+ vbroadcasti32x4 m8, [o(pd_799_4017)]
+ pmulld m8, m1 ; t4 t7
+ vpmulld m0, [o(pd_2896)] {1to16} ; t0 t1
+ REPX {paddd x, m13}, m8, m0
+ REPX {psrad x, 12 }, m8, m0
+ pmulld m3, m8, m12
+ mova m2, m0 ; t3 t2
+ call m(idct_8x8_internal_10bpc).main3
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ pmulld m6, m4 ; t15 t12
+ pmulld m4, m3 ; t9 t10
+ REPX {paddd x, m13}, m6, m4
+ REPX {psrad x, 12 }, m6, m4
+ mova m5, m6 ; t14 t13
+ mova m9, m4 ; t8 t11
+ call m(idct_16x8_internal_10bpc).main3
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m9, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16 t20
+ pmulld m16, m7 ; t31 t27
+ pmulld m22, m17 ; -t19 -t25
+ pmulld m17, m9 ; t28 t24
+ REPX {paddd x, m13}, m23, m16, m17
+ psubd m22, m13, m22
+ REPX {psrad x, 12 }, m23, m16, m22, m17
+ mova m20, m23 ; t30 t26
+ mova m9, m16 ; t17 t21
+ mova m19, m22 ; t18 t22
+ mova m18, m17 ; t29 t25
+ jmp .main3
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m20, [o(pd_2751_2106)]
+ vbroadcasti32x4 m9, [o(pd_3035_3513)]
+ vbroadcasti32x4 m21, [o(pd_3703_3290)]
+ vbroadcasti32x4 m10, [o(pd_1751_2440)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16a t20a
+ pmulld m16, m7 ; t31a t27a
+ pmulld m20, m19 ; -t17a -t21a
+ pmulld m19, m9 ; t30a t26a
+ pmulld m21, m18 ; t18a t22a
+ pmulld m18, m10 ; t29a t25a
+ pmulld m22, m17 ; -t19a -t25a
+ pmulld m17, m11 ; t28a t24a
+ psubd m20, m13, m20
+ psubd m22, m13, m22
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601
+ paddd m20, m13
+ paddd m22, m13
+.main2:
+ REPX {paddd x, m13}, m16, m23, m19
+ REPX {psrad x, 12 }, m16, m20, m23, m19
+ psubd m9, m16, m20 ; t17 t21
+ paddd m16, m20 ; t16 t20
+ psubd m20, m23, m19 ; t30 t26
+ paddd m23, m19 ; t31 t27
+ REPX {pmaxsd x, m14}, m9, m16, m20, m23
+ REPX {paddd x, m13}, m21, m18, m17
+ REPX {psrad x, 12 }, m18, m22, m21, m17
+ psubd m19, m22, m18 ; t18 t22
+ paddd m22, m18 ; t19 t23
+ psubd m18, m17, m21 ; t29 t25
+ paddd m17, m21 ; t28 t24
+ REPX {pmaxsd x, m14}, m19, m22, m18, m17
+ REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
+.main3:
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ psubd m7, m0, m6 ; dct16 out15 out14
+ paddd m0, m6 ; dct16 out0 out1
+ psubd m6, m1, m5 ; dct16 out12 out13
+ paddd m1, m5 ; dct16 out3 out2
+ psubd m5, m2, m4 ; dct16 out11 out10
+ paddd m2, m4 ; dct16 out4 out5
+ psubd m4, m3, m8 ; dct16 out8 out9
+ paddd m3, m8 ; dct16 out7 out6
+ ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11
+ ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
+ punpckhqdq m21, m16, m20 ; t20 t21a
+ punpcklqdq m16, m20 ; t16 t17a
+ punpcklqdq m20, m22, m19 ; t19 t18a
+ punpckhqdq m22, m19 ; t23 t22a
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ punpcklqdq m19, m23, m9 ; t31 t30a
+ punpckhqdq m23, m9 ; t27 t26a
+ punpckhqdq m9, m17, m18 ; t24 t25a
+ punpcklqdq m17, m18 ; t28 t29a
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ psubd m18, m16, m20 ; t19a t18
+ paddd m20, m16 ; t16a t17
+ psubd m16, m19, m17 ; t28a t29
+ paddd m19, m17 ; t31a t30
+ psubd m17, m22, m21 ; t20a t21
+ paddd m22, m21 ; t23a t22
+ psubd m21, m9, m23 ; t27a t26
+ paddd m23, m9 ; t24a t25
+ REPX {pmaxsd x, m14}, m18, m16, m17, m21
+ REPX {pminsd x, m15}, m16, m18, m21, m17
+ ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m20, m22, m19, m23
+ REPX {pminsd x, m15}, m20, m22, m19, m23
+ paddd m9, m20, m22 ; t16 t17a
+ psubd m20, m22 ; t23 t22a
+ paddd m22, m19, m23 ; t31 t30a
+ psubd m19, m23 ; t24 t25a
+ psubd m23, m16, m17 ; t20a t21
+ paddd m16, m17 ; t19a t18
+ psubd m17, m18, m21 ; t27a t26
+ paddd m21, m18 ; t28a t29
+ REPX {pmaxsd x, m14}, m20, m19, m23, m17
+ REPX {pminsd x, m15}, m19, m20, m17, m23
+ REPX {pmulld x, m12}, m19, m20, m17, m23
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ paddd m19, m13
+ paddd m17, m13
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ psubd m18, m19, m20 ; t23a t22
+ paddd m19, m20 ; t24a t25
+ paddd m20, m17, m23 ; t27 t26a
+ psubd m17, m23 ; t20 t21a
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+.transpose_8x32:
+ mova m10, [o(idct32x8p)]
+ psrlw m8, m10, 8
+ mova m9, m8
+ vpermi2w m8, m1, m5
+ vpermt2w m1, m10, m5
+ vprold m5, m9, 16
+ vpermi2w m9, m3, m7
+ vpermt2w m3, m10, m7
+ vprold m10, 16
+ mova m7, m5
+ vpermi2w m5, m0, m4
+ vpermt2w m0, m10, m4
+ vpermi2w m7, m2, m6
+ vpermt2w m2, m10, m6
+ punpckhdq m6, m5, m8
+ punpckldq m5, m8
+ punpckhdq m8, m7, m9
+ punpckldq m7, m9
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpckhdq m3, m0, m1
+ punpckldq m0, m1
+ ret
+
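+; identity.identity 32x8: both passes are plain scalings, so the whole transform
+; collapses to a pmulhrsw by pw_4096 and a word permute (vpermi2w/vpermt2w) per register
+; before adding to dst with clamping; coefficients are zeroed as they are consumed.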
+cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
+ vpbroadcastd m5, [pw_4096]
+ lea r4, [strideq*3]
+ mova m6, [idtx32x8p]
+ lea r5, [strideq*5]
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r6, [strideq+r4*2]
+ pxor m8, m8
+ sub eobd, 107
+ psrlw m7, m6, 8
+.loop:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1] ; 02 13
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3] ; 46 57
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5] ; 8a 9b
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7] ; ce df
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
+ mova m4, m6
+ vpermi2w m4, m1, m3
+ vpermt2w m1, m7, m3
+ REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
+ mova m3, m7
+ vpermi2w m3, m0, m2
+ vpermt2w m0, m6, m2
+ add cq, 64*8
+ punpcklqdq m2, m3, m1 ; 4 5
+ punpckhqdq m3, m1 ; 6 7
+ punpckhqdq m1, m0, m4 ; 2 3
+ punpcklqdq m0, m4 ; 0 1
+ mova ym4, [dstq+strideq*0]
+ vinserti32x8 m4, [dstq+strideq*1], 1
+ paddw m0, m4
+ mova ym4, [dstq+strideq*2]
+ vinserti32x8 m4, [dstq+r4 *1], 1
+ paddw m1, m4
+ mova ym4, [dstq+strideq*4]
+ vinserti32x8 m4, [dstq+r5 *1], 1
+ paddw m2, m4
+ mova ym4, [dstq+r4 *2]
+ vinserti32x8 m4, [dstq+r6 *1], 1
+ paddw m3, m4
+ REPX {pmaxsw x, m8}, m0, m1, m2, m3
+ REPX {pminsw x, m9}, m0, m1, m2, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+r4 *1], m1, 1
+ mova [dstq+strideq*4], ym2
+ vextracti32x8 [dstq+r5 *1], m2, 1
+ mova [dstq+r4 *2], ym3
+ vextracti32x8 [dstq+r6 *1], m3, 1
+ add dstq, 32
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
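+; 16x32 DCT.DCT: pass 1 runs in 32-bit precision with clip_18b_min/max guarding the
+; intermediates; the eob thresholds (36, 151) decide how much of the coefficient block
+; is decoded, and pass 2 reuses the packed 16-bit 8bpc kernels via o_base_8bpc.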
+cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+%if WIN64
+ movaps [rsp+ 8], xmm6
+ movaps [rsp+24], xmm7
+%endif
+ cmp eobd, 36
+ jl .fast
+ call .pass1
+ cmp eobd, 151
+ jge .full
+ lea r5, [o_base_8bpc]
+ pxor m9, m9
+ punpcklwd m8, m1, m1 ; 2
+ punpckhwd m14, m1, m1 ; 3
+ punpcklwd m1, m3, m3 ; 6
+ punpckhwd m15, m3, m3 ; 7
+ punpcklwd m3, m6, m6 ; 12
+ punpckhwd m19, m6, m6 ; 13
+ punpcklwd m6, m9, m4 ; __ 8
+ punpckhwd m20, m4, m4 ; 9
+ punpckhwd m16, m5, m5 ; 11
+ punpcklwd m5, m5 ; 10
+ punpcklwd m9, m0 ; __ 0
+ punpckhwd m21, m0, m0 ; 1
+ punpcklwd m0, m7, m7 ; 14
+ punpckhwd m17, m7, m7 ; 15
+ punpcklwd m7, m2, m2 ; 4
+ punpckhwd m18, m2, m2 ; 5
+ call m(idct_16x16_internal_8bpc).main_fast
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mov r6d, 64*3
+ pxor m8, m8
+.zero_loop:
+ REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0
+ sub r6d, 64
+ jge .zero_loop
+ jmp .pass2_end
+.full:
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 64
+ call .pass1
+ mova m9, [cq-64* 1] ; 0 1
+ mova m14, [cq+64* 1] ; 2 3
+ mova m18, [cq+64* 3] ; 4 5
+ mova m15, [cq+64* 5] ; 6 7
+ mova m20, [cq+64* 7] ; 8 9
+ mova m16, [cq+64* 9] ; 10 11
+ mova m22, [cq+64*11] ; 12 13
+ mova m19, [cq+64*13] ; 14 15
+ lea r5, [o_base_8bpc]
+ punpcklwd m8, m7, m14 ; 30 2
+ punpckhwd m21, m7, m9 ; 31 1
+ punpcklwd m7, m6, m18 ; 28 4
+ punpckhwd m14, m6 ; 3 29
+ punpcklwd m9, m0, m9 ; 16 0
+ punpckhwd m17, m19, m0 ; 15 17
+ punpcklwd m0, m19, m1 ; 14 18
+ punpckhwd m19, m1, m22 ; 19 13
+ punpcklwd m1, m15, m5 ; 6 26
+ punpckhwd m18, m5, m18 ; 27 5
+ punpcklwd m6, m4, m20 ; 24 8
+ punpckhwd m15, m4 ; 7 25
+ punpcklwd m5, m3, m16 ; 22 10
+ punpckhwd m20, m3, m20 ; 23 9
+ punpcklwd m3, m22, m2 ; 12 20
+ punpckhwd m16, m2 ; 11 21
+ call m(idct_16x16_internal_8bpc).main2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ mov r6d, 32*7
+ pxor m8, m8
+.full_zero_loop:
+ REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1
+ sub r6d, 32
+ jge .full_zero_loop
+ jmp .pass2_end
+.fast:
+ mova ym0, [cq+128*0]
+ mova ym2, [cq+128*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*2]
+ mova ym3, [cq+128*6]
+ mova ym4, [cq+128*1]
+ mova ym5, [cq+128*3]
+ mova ym6, [cq+128*5]
+ mova ym7, [cq+128*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ REPX {pmulld x, m12}, m0, m1, m4, m7
+ pxor ym16, ym16
+ mova [cq+128*0], ym16
+ REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7
+ REPX {paddd x, m13}, m0, m1, m4, m7
+ REPX {psrad x, 12 }, m0, m1, m4, m7
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(idct8x32p)]
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ mova m6, [dup16_perm]
+ vpermb m0, m8, m0
+ vpermb m2, m8, m2
+ vprold m8, 16
+ vpermb m1, m8, m1
+ vpermb m3, m8, m3
+ punpckldq m4, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m1, m3
+ punpckhdq m1, m3
+ punpckldq m21, m4, m2
+ punpckhdq m14, m4, m2
+ punpckldq m18, m0, m1
+ punpckhdq m15, m0, m1
+ vpermb m8, m6, m14 ; 2
+ vpermb m1, m6, m15 ; 6
+ vpermb m7, m6, m18 ; 4
+ pmovzxwd m9, ym21 ; 0
+ vpord m6, [o(pb_32)] {1to16}
+ lea r5, [o_base_8bpc]
+ vpermb m21, m6, m21 ; 1
+ vpermb m15, m6, m15 ; 7
+ vpermb m18, m6, m18 ; 5
+ vpermb m14, m6, m14 ; 3
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+.pass2_end:
+ movshdup m22, [permC]
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m12, m12
+ psrlq m23, m22, 8
+ vpermq m8, m22, m0
+ vpermq m9, m23, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m2
+ vpermq m9, m23, m3
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m4
+ vpermq m9, m23, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m6
+ vpermq m9, m23, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m14
+ vpermq m9, m23, m15
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m16
+ vpermq m9, m23, m17
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m18
+ vpermq m9, m23, m19
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m20
+ vpermq m9, m23, m21
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ vzeroupper
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+.pass1:
+ pmulld m0, m12, [cq+128* 0]
+ pmulld m1, m12, [cq+128* 2]
+ pmulld m2, m12, [cq+128* 4]
+ pmulld m3, m12, [cq+128* 6]
+ pmulld m4, m12, [cq+128* 8]
+ pmulld m5, m12, [cq+128*10]
+ pmulld m6, m12, [cq+128*12]
+ pmulld m7, m12, [cq+128*14]
+ call m(idct_8x16_internal_10bpc).main_rect2
+ pmulld m16, m12, [cq+128* 1]
+ pmulld m17, m12, [cq+128* 3]
+ pmulld m18, m12, [cq+128* 5]
+ pmulld m19, m12, [cq+128* 7]
+ pmulld m20, m12, [cq+128* 9]
+ pmulld m21, m12, [cq+128*11]
+ pmulld m22, m12, [cq+128*13]
+ pmulld m23, m12, [cq+128*15]
+ call m(idct_16x16_internal_10bpc).main_rect2
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_16x16_internal_10bpc).main_end2
+ jmp m(idct_16x16_internal_10bpc).main_end3
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly
+
+cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m10, [pw_2896x8]
+ vpbroadcastd m11, [pw_1697x16]
+ vpbroadcastd m13, [pw_8192]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ lea r6, [strideq*9]
+ pxor m14, m14
+ paddw m12, m13, m13 ; pw_16384
+ cmp eobd, 151
+ jl .main
+ call .main
+ add cq, 64-128*4
+ lea dstq, [dstq+strideq*8]
+.main:
+ call .main_internal
+ add cq, 128*4
+ pmulhrsw m1, m13, m2
+ pmulhrsw m3, m13, m4
+ pmulhrsw m5, m13, m6
+ pmulhrsw m7, m13, m8
+ call .main_internal
+.main2:
+ pmulhrsw m2, m13
+ pmulhrsw m4, m13
+ pmulhrsw m6, m13
+ pmulhrsw m8, m13
+ punpcklqdq m0, m1, m2 ; 0 8
+ punpckhqdq m1, m2 ; 1 9
+ call .write_16x2x2
+ punpcklqdq m0, m3, m4 ; 2 10
+ punpckhqdq m1, m3, m4 ; 3 11
+ call .write_16x2x2
+ punpcklqdq m0, m5, m6 ; 4 12
+ punpckhqdq m1, m5, m6 ; 5 13
+ call .write_16x2x2
+ punpcklqdq m0, m7, m8 ; 6 14
+ punpckhqdq m1, m7, m8 ; 7 15
+.write_16x2x2:
+ mova ym2, [dstq+strideq*0]
+ vinserti32x8 m2, [dstq+strideq*8], 1
+ mova ym9, [dstq+strideq*1]
+ vinserti32x8 m9, [dstq+r6 ], 1
+ paddw m0, m2
+ paddw m1, m9
+ pmaxsw m0, m14
+ pmaxsw m1, m14
+ pminsw m0, m15
+ pminsw m1, m15
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*8], m0, 1
+ mova [dstq+strideq*1], ym1
+ vextracti32x8 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+strideq*2]
+ ret
+.main_internal:
+ mova m8, [cq+128* 0]
+ packssdw m8, [cq+128* 8]
+ mova m6, [cq+128* 1]
+ packssdw m6, [cq+128* 9]
+ mova m0, [cq+128* 2]
+ packssdw m0, [cq+128*10]
+ mova m2, [cq+128* 3]
+ packssdw m2, [cq+128*11]
+ REPX {pmulhrsw x, m10}, m8, m6, m0, m2
+ REPX {vpermq x, x, q3120}, m8, m6, m0, m2
+ pmulhrsw m4, m11, m8
+ pmulhrsw m9, m11, m6
+ REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
+ pmulhrsw m4, m12
+ pmulhrsw m9, m12
+ paddsw m8, m4
+ paddsw m6, m9
+ pmulhrsw m4, m11, m0
+ pmulhrsw m9, m11, m2
+ REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
+ pmulhrsw m4, m12
+ pmulhrsw m9, m12
+ paddsw m0, m4
+ paddsw m2, m9
+ punpcklwd m4, m8, m6
+ punpckhwd m8, m6
+ punpcklwd m6, m0, m2
+ punpckhwd m0, m2
+ punpckldq m2, m4, m6 ; 0 1
+ punpckhdq m4, m6 ; 2 3
+ punpckldq m6, m8, m0 ; 4 5
+ punpckhdq m8, m0 ; 6 7
+ ret
+
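+; 32x16 DCT.DCT: coefficients are pre-multiplied by pd_2896 on load (the 1/sqrt(2) scale
+; applied to 2:1 rectangular transforms; hence the *_rect2 helper names) before the
+; 16-point first pass.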
+cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+%if WIN64
+ movaps [rsp+ 8], xmm6
+ movaps [rsp+24], xmm7
+%endif
+ mov r6d, 8*12
+ cmp eobd, 36
+ jl .fast
+ pmulld m0, m12, [cq+64* 0]
+ pmulld m1, m12, [cq+64* 4]
+ pmulld m2, m12, [cq+64* 8]
+ pmulld m3, m12, [cq+64*12]
+ pmulld m16, m12, [cq+64* 2]
+ pmulld m17, m12, [cq+64* 6]
+ pmulld m18, m12, [cq+64*10]
+ pmulld m19, m12, [cq+64*14]
+ cmp eobd, 151
+ jge .full
+ call m(idct_8x16_internal_10bpc).main_fast_rect2
+ call m(idct_16x16_internal_10bpc).main_fast_rect2
+ call .idct16_sumsub
+ call .pass1_load_spill
+ call .main_fast_rect2
+ jmp .pass1_end
+.full:
+ pmulld m4, m12, [cq+64*16]
+ pmulld m5, m12, [cq+64*20]
+ pmulld m6, m12, [cq+64*24]
+ pmulld m7, m12, [cq+64*28]
+ pmulld m20, m12, [cq+64*18]
+ pmulld m21, m12, [cq+64*22]
+ pmulld m22, m12, [cq+64*26]
+ pmulld m23, m12, [cq+64*30]
+ add r6d, 8*16
+ call m(idct_8x16_internal_10bpc).main_rect2
+ call m(idct_16x16_internal_10bpc).main_rect2
+ call .idct16_sumsub
+ call .pass1_load_spill
+ pmulld m16, m12, [cq+64*17]
+ pmulld m17, m12, [cq+64*19]
+ pmulld m18, m12, [cq+64*21]
+ pmulld m19, m12, [cq+64*23]
+ pmulld m20, m12, [cq+64*25]
+ pmulld m21, m12, [cq+64*27]
+ pmulld m22, m12, [cq+64*29]
+ pmulld m23, m12, [cq+64*31]
+ call .main_rect2
+.pass1_end:
+ vpbroadcastd m11, [o(pd_1)]
+ lea r4, [cq+64]
+ call .idct32_pass1_end
+ lea r5, [o_base_8bpc]
+ punpckhqdq m19, m5, m16 ; 11
+ punpcklqdq m5, m16 ; 10
+ punpckhqdq m16, m2, m1 ; 5
+ punpcklqdq m2, m1 ; 4
+ punpcklqdq m1, m15, m4 ; 2
+ punpckhqdq m15, m4 ; 3
+ punpcklqdq m4, m14, m18 ; 8
+ punpckhqdq m18, m14, m18 ; 9
+ punpckhqdq m14, m0, m20 ; 1
+ punpcklqdq m0, m20 ; 0
+ punpckhqdq m20, m6, m17 ; 13
+ punpcklqdq m6, m17 ; 12
+ punpckhqdq m17, m3, m21 ; 7
+ punpcklqdq m3, m21 ; 6
+ punpckhqdq m21, m7, m8 ; 15
+ punpcklqdq m7, m8 ; 14
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ jmp .end
+.fast:
+ pmulld ym0, ym12, [cq+64*0]
+ pmulld ym1, ym12, [cq+64*4]
+ movshdup m7, [o(permB)]
+ mova ym4, [cq+64*2]
+ mova ym5, [cq+64*6]
+ mova ym16, [cq+64*1]
+ mova ym2, [cq+64*5]
+ mova ym3, [cq+64*3]
+ mova ym17, [cq+64*7]
+ vpermt2q m4, m7, m5 ; 2 6
+ vpermt2q m16, m7, m2 ; 1 5
+ vpermt2q m17, m7, m3 ; 7 3
+ paddd ym0, ym13
+ paddd ym1, ym13
+ psrad ym0, 12
+ psrad ym1, 12
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m1, m7, m1 ; 4 4
+ REPX {pmulld x, m12}, m4, m16, m17
+ REPX {paddd x, m13}, m4, m16, m17
+ REPX {psrad x, 12 }, m4, m16, m17
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_16x16_internal_10bpc).main_end2
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
+ lea r5, [o_base_8bpc]
+ punpckhqdq m14, m0, m2 ; 1
+ punpcklqdq m0, m2 ; 0
+ punpcklqdq m1, m3, m4 ; 2
+ punpckhqdq m15, m3, m4 ; 3
+ punpcklqdq m2, m5, m7 ; 4
+ punpckhqdq m16, m5, m7 ; 5
+ punpcklqdq m3, m6, m8 ; 6
+ punpckhqdq m17, m6, m8 ; 7
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+.end:
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ pxor m12, m12
+.zero_loop:
+ mova [cq+r6*8+64*3], m12
+ mova [cq+r6*8+64*2], m12
+ mova [cq+r6*8+64*1], m12
+ mova [cq+r6*8+64*0], m12
+ sub r6d, 8*4
+ jge .zero_loop
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
+ pmulhrsw m0, m11, m14
+ pmulhrsw m1, m11, m15
+ pmulhrsw m2, m11, m16
+ pmulhrsw m3, m11, m17
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m18
+ pmulhrsw m1, m11, m19
+ pmulhrsw m2, m11, m20
+ pmulhrsw m3, m11, m21
+ vzeroupper
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd m3, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m2, r6d
+ paddsw m2, m3
+.dconly_loop:
+ paddsw m0, m2, [dstq+strideq*0]
+ paddsw m1, m2, [dstq+strideq*1]
+ psubusw m0, m3
+ psubusw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+ALIGN function_align
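+; Sum/difference stage: m0-m7 are combined pairwise with m9 and m16-m23 into t0-t15,
+; with each result clamped to the 18-bit intermediate range (clip_18b) between steps.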
+.idct16_sumsub:
+ psubd m23, m0, m22 ; t15
+ paddd m0, m22 ; t0
+ psubd m22, m1, m21 ; t14
+ paddd m1, m21 ; t1
+ REPX {pmaxsd x, m14}, m23, m0, m22, m1
+ psubd m21, m2, m20 ; t13
+ paddd m2, m20 ; t2
+ REPX {pminsd x, m15}, m23, m0, m22, m1
+ psubd m20, m3, m19 ; t12
+ paddd m3, m19 ; t3
+ REPX {pmaxsd x, m14}, m21, m2, m20, m3
+ psubd m19, m4, m18 ; t11
+ paddd m4, m18 ; t4
+ REPX {pminsd x, m15}, m21, m2, m20, m3
+ psubd m18, m5, m17 ; t10
+ paddd m5, m17 ; t5
+ REPX {pmaxsd x, m14}, m19, m4, m18, m5
+ psubd m17, m6, m16 ; t9
+ paddd m6, m16 ; t6
+ REPX {pminsd x, m15}, m19, m4, m18, m5
+ psubd m16, m7, m9 ; t8
+ paddd m7, m9 ; t7
+ REPX {pmaxsd x, m14}, m17, m6, m16, m7
+ REPX {pminsd x, m15}, m17, m6, m16, m7
+ ret
+.idct32_pass1_end:
+ psrlq m12, [o(permC)], 24 ; 0 2 8 10 1 3 9 11
+ psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
+%macro IDCT32_PASS1_END 2 ; low, high
+ paddd m8, m11, [r4+128*%1]
+ paddd m9, m11, [cq+128*%1]
+ psubd m10, m8, m%1 ; out 16+n
+ paddd m8, m%1 ; out 15-n
+ paddd m%1, m9, m%2 ; out 0+n
+ psubd m9, m%2 ; out 31-n
+ REPX {vpsravd x, m11}, m10, m%1, m8, m9
+ packssdw m%1, m10 ; 0+n 16+n
+ packssdw m%2, m8, m9 ; 15-n 31-n
+%endmacro
+ IDCT32_PASS1_END 0, 23 ; 0 16, 15 31
+ IDCT32_PASS1_END 7, 16 ; 7 23, 8 24
+ mova m14, m13
+ vpermi2q m14, m0, m16
+ vpermt2q m0, m12, m16
+ IDCT32_PASS1_END 1, 22 ; 1 17, 14 30
+ IDCT32_PASS1_END 6, 17 ; 6 22, 9 25
+ mova m15, m13
+ vpermi2q m15, m1, m17
+ vpermt2q m1, m12, m17
+ IDCT32_PASS1_END 2, 21 ; 2 18, 13 29
+ IDCT32_PASS1_END 5, 18 ; 5 21, 10 26
+ mova m16, m13
+ vpermi2q m16, m2, m18
+ vpermt2q m2, m12, m18
+ IDCT32_PASS1_END 3, 20 ; 3 19, 12 28
+ IDCT32_PASS1_END 4, 19 ; 4 20, 11 27
+ mova m17, m13
+ vpermi2q m17, m3, m19
+ vpermt2q m3, m12, m19
+ mova m18, m13
+ vpermi2q m18, m4, m20
+ vpermt2q m4, m12, m20
+ mova m19, m13
+ vpermi2q m19, m5, m21
+ vpermt2q m5, m12, m21
+ mova m20, m13
+ vpermi2q m20, m6, m22
+ vpermt2q m6, m12, m22
+ mova m21, m13
+ vpermi2q m21, m7, m23
+ vpermt2q m7, m12, m23
+ punpckhwd m8, m2, m3 ; c04 d04 c05 d05 c06 d06 c07 d07
+ punpcklwd m2, m3 ; c00 d00 c01 d01 c02 d02 c03 d03
+ punpckhwd m3, m0, m1 ; a04 b04 a05 b05 a06 b06 a07 b07
+ punpcklwd m0, m1 ; a00 b00 a01 b01 a02 b02 a03 b03
+ punpckhwd m1, m4, m5 ; e04 f04 e05 f05 e06 f06 e07 f07
+ punpcklwd m4, m5 ; e00 f00 e01 f01 e02 f02 e03 f03
+ punpckhwd m5, m6, m7 ; g04 h04 g05 h05 g06 h06 g07 h07
+ punpcklwd m6, m7 ; g00 h00 g01 h01 g02 h02 g03 h03
+ punpckhwd m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15
+ punpcklwd m14, m15 ; a08 b08 a09 b09 a10 b10 a11 b11
+ punpckhwd m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15
+ punpcklwd m16, m17 ; c08 d08 c09 d09 c10 d10 c11 d11
+ punpckhwd m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15
+ punpcklwd m18, m19 ; e08 f08 e09 f09 e10 f10 e11 f11
+ punpckhwd m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15
+ punpcklwd m20, m21 ; g08 h08 g09 h09 g10 h10 g11 h11
+ punpckhdq m21, m1, m5 ; e06 f06 g06 h06 e07 f07 g07 h07
+ punpckldq m1, m5 ; e04 f04 g04 h04 e05 f05 g05 h05
+ punpckhdq m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11
+ punpckldq m14, m16 ; a08 b08 c08 d08 a09 b09 c09 d09
+ punpckhdq m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11
+ punpckldq m18, m20 ; e08 f08 g08 h08 e09 f09 g09 h09
+ punpckldq m20, m4, m6 ; e00 f00 g00 h00 e01 f01 g01 h01
+ punpckhdq m4, m6 ; e02 f02 g02 h02 e03 f03 g03 h03
+ punpckldq m6, m7, m15 ; a12 b12 c12 d12 a13 b13 c13 d13
+ punpckhdq m7, m15 ; a14 b14 c14 d14 a15 b15 c15 d15
+ punpckhdq m15, m0, m2 ; a02 b02 c02 d02 a03 b03 c03 d03
+ punpckldq m0, m2 ; a00 b00 c00 d00 a01 b01 c01 d01
+ punpckldq m2, m3, m8 ; a04 b04 c04 d04 a05 b05 c05 d05
+ punpckhdq m3, m8 ; a06 b06 c06 d06 a07 b07 c07 d07
+ punpckhdq m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15
+ punpckldq m17, m19 ; e12 f12 g12 h12 e13 f13 g13 h13
+ ret
+.pass1_load_spill:
+ mova [cq+64* 0], m0
+ mova [cq+64* 2], m1
+ mova [cq+64* 4], m2
+ mova [cq+64* 6], m3
+ mova [cq+64* 8], m4
+ mova [cq+64*10], m5
+ mova [cq+64*12], m6
+ mova [cq+64*14], m7
+ pmulld m0, m12, [cq+64* 1]
+ pmulld m1, m12, [cq+64* 3]
+ pmulld m2, m12, [cq+64* 5]
+ pmulld m3, m12, [cq+64* 7]
+ pmulld m4, m12, [cq+64* 9]
+ pmulld m5, m12, [cq+64*11]
+ pmulld m6, m12, [cq+64*13]
+ pmulld m7, m12, [cq+64*15]
+ mova [cq+64* 1], m23
+ mova [cq+64* 3], m22
+ mova [cq+64* 5], m21
+ mova [cq+64* 7], m20
+ mova [cq+64* 9], m19
+ mova [cq+64*11], m18
+ mova [cq+64*13], m17
+ mova [cq+64*15], m16
+ ret
+.main_fast_rect2:
+ call m(idct_8x16_internal_10bpc).round
+.main_fast: ; bottom half is zero
+ pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a
+ pmulld m0, [o(pd_201)] {1to16} ; t16a
+ pmulld m16, m7, [o(pd_2751)] {1to16} ; t17a
+ pmulld m7, [o(pd_3035)] {1to16} ; t30a
+ pmulld m19, m4, [o(pd_3703)] {1to16} ; t29a
+ pmulld m4, [o(pd_1751)] {1to16} ; t18a
+ pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a
+ pmulld m3, [o(pd_3857)] {1to16} ; t28a
+ pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a
+ pmulld m2, [o(pd_995)] {1to16} ; t20a
+ pmulld m18, m5, [o(pd_2106)] {1to16} ; t21a
+ pmulld m5, [o(pd_3513)] {1to16} ; t26a
+ pmulld m17, m6, [o(pd_3290)] {1to16} ; t25a
+ pmulld m6, [o(pd_2440)] {1to16} ; t22a
+ pmulld m22, m1, [o(pd_601)] {1to16} ; t23a
+ pmulld m1, [o(pd_4052)] {1to16} ; t24a
+ REPX {psubd x, m13, x}, m16, m20, m18, m22
+ call m(idct_16x16_internal_10bpc).round3
+ jmp .main2
+.main_rect2:
+ call m(idct_8x16_internal_10bpc).round
+ call m(idct_16x16_internal_10bpc).round
+.main:
+ ITX_MULSUB_2D 0, 23, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 16, 7, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+ ITX_MULSUB_2D 4, 19, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 20, 3, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 21, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 18, 5, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 6, 17, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2D 22, 1, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ call m(idct_16x16_internal_10bpc).round
+.main2:
+ call m(idct_8x16_internal_10bpc).round
+ psubd m8, m0, m16 ; t17
+ paddd m0, m16 ; t16
+ psubd m16, m23, m7 ; t30
+ paddd m23, m7 ; t31
+ REPX {pmaxsd x, m14}, m8, m0, m16, m23
+ paddd m7, m20, m4 ; t19
+ psubd m20, m4 ; t18
+ REPX {pminsd x, m15}, m8, m0, m16, m23
+ paddd m4, m3, m19 ; t28
+ psubd m3, m19 ; t29
+ REPX {pmaxsd x, m14}, m7, m20, m4, m3
+ psubd m19, m2, m18 ; t21
+ paddd m2, m18 ; t20
+ REPX {pminsd x, m15}, m7, m20, m4, m3
+ psubd m18, m21, m5 ; t26
+ paddd m21, m5 ; t27
+ REPX {pmaxsd x, m14}, m19, m2, m18, m21
+ psubd m5, m22, m6 ; t22
+ paddd m6, m22 ; t23
+ REPX {pminsd x, m15}, m19, m2, m18, m21
+ psubd m22, m1, m17 ; t25
+ paddd m17, m1 ; t24
+ REPX {pmaxsd x, m14}, m5, m6, m22, m17
+ vpbroadcastd m11, [o(pd_4017)]
+ vpbroadcastd m10, [o(pd_799)]
+ REPX {pminsd x, m15}, m5, m6, m22, m17
+ ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a
+ ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a
+ vpbroadcastd m11, [o(pd_2276)]
+ vpbroadcastd m10, [o(pd_3406)]
+ ITX_MULSUB_2D 18, 19, 9, 1, _, 13, 10, 11 ; t21a, t26a
+ ITX_MULSUB_2D 22, 5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a
+ paddd m1, m6, m2 ; t23a
+ psubd m6, m2 ; t20a
+ psubd m2, m17, m21 ; t27a
+ paddd m17, m21 ; t24a
+ REPX {pmaxsd x, m14}, m1, m6, m2, m17
+ psubd m21, m23, m4 ; t28a
+ paddd m23, m4 ; t31a
+ REPX {pminsd x, m15}, m1, m6, m2, m17
+ psubd m4, m16, m20 ; t18
+ paddd m16, m20 ; t17
+ REPX {pmaxsd x, m14}, m21, m23, m4, m16
+ psubd m20, m0, m7 ; t19a
+ paddd m0, m7 ; t16a
+ REPX {pminsd x, m15}, m21, m23, m4, m16
+ psubd m7, m8, m3 ; t29
+ paddd m3, m8 ; t30
+ REPX {pmaxsd x, m14}, m20, m0, m7, m3
+ paddd m8, m5, m18 ; t22
+ psubd m5, m18 ; t21
+ REPX {pminsd x, m15}, m20, m0, m7, m3
+ psubd m18, m22, m19 ; t26
+ paddd m22, m19 ; t25
+ REPX {pmaxsd x, m14}, m8, m5, m18, m22
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m8, m5, m18, m22
+ ITX_MULSUB_2D 21, 20, 9, 19, _, 13, 10, 11 ; t19, t28
+ ITX_MULSUB_2D 2, 6, 9, 19, _, 13, 10, 11, 2 ; t27, t20
+ ITX_MULSUB_2D 7, 4, 9, 19, _, 13, 10, 11 ; t18a, t29a
+ ITX_MULSUB_2D 18, 5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a
+ psubd m19, m0, m1 ; t23
+ paddd m0, m1 ; t16
+ paddd m1, m8, m16 ; t17a
+ psubd m8, m16, m8 ; t22a
+ REPX {pmaxsd x, m14}, m19, m0, m1, m8
+ psubd m16, m23, m17 ; t24
+ paddd m23, m17 ; t31
+ REPX {pminsd x, m15}, m19, m0, m1, m8
+ psubd m17, m3, m22 ; t25a
+ paddd m22, m3 ; t30a
+ REPX {pmaxsd x, m14}, m16, m23, m17, m22
+ paddd m3, m6, m21 ; t19a
+ psubd m6, m21, m6 ; t20a
+ REPX {pminsd x, m15}, m16, m23, m17, m22
+ paddd m21, m18, m4 ; t29
+ psubd m18, m4, m18 ; t26
+ REPX {pmaxsd x, m14}, m3, m6, m21, m18
+ psubd m4, m20, m2 ; t27a
+ paddd m20, m2 ; t28a
+ REPX {pminsd x, m15}, m3, m6, m21, m18
+ paddd m2, m7, m5 ; t18
+ psubd m7, m5 ; t21
+ REPX {pmaxsd x, m14}, m4, m20, m2, m7
+ REPX {pminsd x, m15}, m4, m20, m2, m7
+ REPX {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8
+ REPX {paddd x, m13}, m18, m16, m4, m17
+ psubd m5, m18, m7 ; t21a
+ paddd m18, m7 ; t26a
+ psubd m7, m16, m19 ; t23a
+ paddd m16, m19 ; t24a
+ REPX {psrad x, 12 }, m5, m18, m7, m16
+ paddd m19, m4, m6 ; t27
+ psubd m4, m6 ; t20
+ psubd m6, m17, m8 ; t22
+ paddd m17, m8 ; t25
+ REPX {psrad x, 12 }, m19, m4, m6, m17
+ ret
+
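+; identity.identity 32x16: the rectangular 1/sqrt(2) factor (pw_2896x8) and the identity
+; upscales (pw_1697x16 plus the paddsw doublings) are folded into .main_internal; rows
+; are then written out through the shared 16x32 .main2 path.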
+cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m10, [pw_2896x8]
+ vpbroadcastd m11, [pw_1697x16]
+ vpbroadcastd m13, [pw_2048]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ lea r6, [strideq*9]
+ pxor m14, m14
+ cmp eobd, 151
+ jl .main
+ mov r4, dstq
+ call .main
+ add cq, 64*12
+ lea dstq, [r4+32]
+.main:
+ call .main_internal
+ add cq, 64*4
+ pmulhrsw m1, m13, m2
+ pmulhrsw m3, m13, m4
+ pmulhrsw m5, m13, m6
+ pmulhrsw m7, m13, m8
+ call .main_internal
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+.main_internal:
+ mova m8, [cq+64* 0]
+ packssdw m8, [cq+64* 8]
+ mova m6, [cq+64* 1]
+ packssdw m6, [cq+64* 9]
+ mova m0, [cq+64* 2]
+ packssdw m0, [cq+64*10]
+ mova m2, [cq+64* 3]
+ packssdw m2, [cq+64*11]
+ REPX {pmulhrsw x, m10}, m8, m6, m0, m2
+ REPX {paddsw x, x }, m8, m6, m0, m2
+ REPX {vpermq x, x, q3120}, m8, m6, m0, m2
+ pmulhrsw m4, m11, m8
+ pmulhrsw m9, m11, m6
+ paddsw m8, m8
+ paddsw m6, m6
+ REPX {mova [cq+64*x], m14}, 0, 1, 2, 3
+ paddsw m8, m4
+ paddsw m6, m9
+ pmulhrsw m4, m11, m0
+ pmulhrsw m9, m11, m2
+ paddsw m0, m0
+ paddsw m2, m2
+ REPX {mova [cq+64*x], m14}, 8, 9, 10, 11
+ paddsw m0, m4
+ paddsw m2, m9
+ punpcklwd m4, m8, m6
+ punpckhwd m8, m6
+ punpcklwd m6, m0, m2
+ punpckhwd m0, m2
+ punpckldq m2, m4, m6 ; 0 1
+ punpckhdq m4, m6 ; 2 3
+ punpckldq m6, m8, m0 ; 4 5
+ punpckhdq m8, m0 ; 6 7
+ ret
+
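+; 32x32 DCT.DCT: the eob thresholds (36, 136, 543) select progressively larger nonzero
+; regions, matching the .fast2 (top-left 8x8), .fast (top-left 16x16) and .pass1_fast
+; (bottom-right 16x16 zero) paths below.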
+cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ WIN64_SPILL_XMM 30
+ cmp eobd, 136
+ jl .fast
+ add cq, 64
+ cmp eobd, 543
+ jge .full
+ call .pass1_fast ; bottomright 16x16 zero
+ mov r6d, 16*12
+ jmp .lefthalf
+.full:
+ call .pass1
+ mov r6d, 16*28
+.lefthalf:
+ mova [cq+128* 0], m0
+ mova [cq+128* 1], m1
+ mova [cq+128* 2], m2
+ mova [cq+128* 3], m3
+ mova [cq+128* 4], m14
+ mova [cq+128* 5], m15
+ mova [cq+128* 6], m16
+ mova [cq+128* 7], m17
+ mova [cq+128* 8], m22
+ mova [cq+128* 9], m23
+ mova [cq+128*10], m24
+ mova [cq+128*11], m25
+ mova [cq+128*12], m26
+ mova [cq+128*13], m27
+ mova [cq+128*14], m28
+ mova [cq+128*15], m29
+ sub cq, 64
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call .pass1
+ lea r5, [o_base_8bpc]
+ mova m4, [cq+64+128* 0]
+ mova m5, [cq+64+128* 1]
+ mova m6, [cq+64+128* 2]
+ mova m7, [cq+64+128* 3]
+ mova m18, [cq+64+128* 4]
+ mova m19, [cq+64+128* 5]
+ mova m20, [cq+64+128* 6]
+ mova m21, [cq+64+128* 7]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+128*0], m14
+ mova [cq+128*1], m15
+ mova [cq+128*2], m16
+ mova [cq+128*3], m17
+ mova [cq+128*4], m18
+ mova [cq+128*5], m19
+ mova [cq+128*6], m20
+ mova [cq+128*7], m21
+ mova m14, [cq+64+128* 8]
+ mova m15, [cq+64+128* 9]
+ mova m16, [cq+64+128*10]
+ mova m17, [cq+64+128*11]
+ mova m18, [cq+64+128*12]
+ mova m19, [cq+64+128*13]
+ mova m20, [cq+64+128*14]
+ mova m21, [cq+64+128*15]
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ pxor m12, m12
+.right_zero_loop:
+ mova [cq+r6*8+64+128*3], m12
+ mova [cq+r6*8+64+128*2], m12
+ mova [cq+r6*8+64+128*1], m12
+ mova [cq+r6*8+64+128*0], m12
+ sub r6d, 16*4
+ jge .right_zero_loop
+ mov r6d, 16*28
+ jmp .end2
+.fast: ; topleft 16x16 nonzero
+ cmp eobd, 36
+ jl .fast2
+ call .pass1_fast
+ lea r5, [o_base_8bpc]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova [cq+128*0], m14
+ mova [cq+128*1], m15
+ mova [cq+128*2], m16
+ mova [cq+128*3], m17
+ mova [cq+128*4], m18
+ mova [cq+128*5], m19
+ mova [cq+128*6], m20
+ mova [cq+128*7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ jmp .end
+.fast2: ; topleft 8x8 nonzero
+ movshdup m7, [o(permB)]
+ mova ym0, [cq+128*0]
+ mova ym1, [cq+128*4]
+ mova ym4, [cq+128*2]
+ mova ym5, [cq+128*6]
+ mova ym16, [cq+128*1]
+ mova ym2, [cq+128*5]
+ mova ym3, [cq+128*3]
+ mova ym17, [cq+128*7]
+ mov r6d, 16*4
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m1, m7, m1 ; 4 4
+ vpermt2q m4, m7, m5 ; 2 6
+ vpermt2q m16, m7, m2 ; 1 5
+ vpermt2q m17, m7, m3 ; 7 3
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
+ call m(idct_16x16_internal_10bpc).main_end
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
+ lea r5, [o_base_8bpc]
+ punpckhqdq m22, m0, m2 ; 1
+ punpcklqdq m0, m2 ; 0
+ punpcklqdq m1, m5, m7 ; 4
+ punpckhqdq m24, m5, m7 ; 5
+ punpcklqdq m14, m3, m4 ; 2
+ punpckhqdq m23, m3, m4 ; 3
+ punpcklqdq m15, m6, m8 ; 6
+ punpckhqdq m25, m6, m8 ; 7
+ mova m10, m13
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [cq+128*0], m14
+ mova [cq+128*1], m15
+ mova [cq+128*2], m16
+ mova [cq+128*3], m17
+ mova [cq+128*4], m18
+ mova [cq+128*5], m19
+ mova [cq+128*6], m20
+ mova [cq+128*7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+.end:
+ pxor m12, m12
+.end2:
+ psubsw m9, m0, m29 ; out31
+ paddsw m0, m29 ; out0
+ psubsw m29, m1, m28 ; out30
+ paddsw m1, m28 ; out1
+ psubsw m28, m2, m27 ; out29
+ paddsw m2, m27 ; out2
+ psubsw m27, m3, m26 ; out28
+ paddsw m3, m26 ; out3
+ psubsw m26, m4, m25 ; out27
+ paddsw m4, m25 ; out4
+ psubsw m25, m5, m24 ; out26
+ paddsw m5, m24 ; out5
+ psubsw m24, m6, m23 ; out25
+ paddsw m6, m23 ; out6
+ psubsw m23, m7, m22 ; out24
+ paddsw m7, m22 ; out7
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ psubsw m22, m0, m21 ; out23
+ paddsw m0, m21 ; out8
+ psubsw m21, m1, m20 ; out22
+ paddsw m1, m20 ; out9
+ psubsw m20, m2, m19 ; out21
+ paddsw m2, m19 ; out10
+ psubsw m19, m3, m18 ; out20
+ paddsw m3, m18 ; out11
+ psubsw m18, m4, m17 ; out19
+ paddsw m4, m17 ; out12
+ psubsw m17, m5, m16 ; out18
+ paddsw m5, m16 ; out13
+ psubsw m16, m6, m15 ; out17
+ paddsw m6, m15 ; out14
+ psubsw m15, m7, m14 ; out16
+ paddsw m7, m14 ; out15
+.zero_loop:
+ mova [cq+r6*8+128*3], m12
+ mova [cq+r6*8+128*2], m12
+ mova [cq+r6*8+128*1], m12
+ mova [cq+r6*8+128*0], m12
+ sub r6d, 16*4
+ jge .zero_loop
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
+ pmulhrsw m0, m11, m15
+ pmulhrsw m1, m11, m16
+ pmulhrsw m2, m11, m17
+ pmulhrsw m3, m11, m18
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m19
+ pmulhrsw m1, m11, m20
+ pmulhrsw m2, m11, m21
+ pmulhrsw m3, m11, m22
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m23
+ pmulhrsw m1, m11, m24
+ pmulhrsw m2, m11, m25
+ pmulhrsw m3, m11, m26
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m27
+ pmulhrsw m1, m11, m28
+ pmulhrsw m2, m11, m29
+ pmulhrsw m3, m11, m9
+ WIN64_RESTORE_XMM
+ vzeroupper
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
+.pass1_fast:
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mov r6d, 16*12
+ call m(idct_8x16_internal_10bpc).main_fast
+ mova m16, [cq+128* 2]
+ mova m17, [cq+128* 6]
+ mova m18, [cq+128*10]
+ mova m19, [cq+128*14]
+ call m(idct_16x16_internal_10bpc).main_fast
+ call .pass1_load_spill
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
+ jmp .pass1_end
+.pass1:
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mova m4, [cq+128*16]
+ mova m5, [cq+128*20]
+ mova m6, [cq+128*24]
+ mova m7, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+128* 2]
+ mova m17, [cq+128* 6]
+ mova m18, [cq+128*10]
+ mova m19, [cq+128*14]
+ mova m20, [cq+128*18]
+ mova m21, [cq+128*22]
+ mova m22, [cq+128*26]
+ mova m23, [cq+128*30]
+ call m(idct_16x16_internal_10bpc).main
+ call .pass1_load_spill
+ mova m16, [cq+128*17]
+ mova m17, [cq+128*19]
+ mova m18, [cq+128*21]
+ mova m19, [cq+128*23]
+ mova m20, [cq+128*25]
+ mova m21, [cq+128*27]
+ mova m22, [cq+128*29]
+ mova m23, [cq+128*31]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main
+.pass1_end:
+ vpbroadcastd m11, [o(pd_2)]
+ lea r4, [cq+128*8]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
+ punpckhqdq m22, m0, m20 ; 1
+ punpcklqdq m0, m20 ; 0
+ punpckhqdq m24, m2, m1 ; 5
+ punpcklqdq m1, m2, m1 ; 4
+ punpcklqdq m2, m14, m18 ; 8
+ punpckhqdq m26, m14, m18 ; 9
+ punpcklqdq m14, m15, m4 ; 2
+ punpckhqdq m23, m15, m4 ; 3
+ punpckhqdq m25, m3, m21 ; 7
+ punpcklqdq m15, m3, m21 ; 6
+ punpckhqdq m28, m6, m17 ; 13
+ punpcklqdq m3, m6, m17 ; 12
+ punpckhqdq m27, m5, m16 ; 11
+ punpcklqdq m16, m5, m16 ; 10
+ punpckhqdq m29, m7, m8 ; 15
+ punpcklqdq m17, m7, m8 ; 14
+ ret
+.pass1_load_spill:
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
+ mova [cq+128* 0], m0
+ mova m0, [cq+128* 1]
+ mova [cq+128* 1], m1
+ mova [cq+128* 2], m2
+ mova m1, [cq+128* 3]
+ mova m2, [cq+128* 5]
+ mova [cq+128* 3], m3
+ mova [cq+128* 4], m4
+ mova m3, [cq+128* 7]
+ mova m4, [cq+128* 9]
+ mova [cq+128* 5], m5
+ mova [cq+128* 6], m6
+ mova [cq+128* 7], m7
+ mova m5, [cq+128*11]
+ mova m6, [cq+128*13]
+ mova m7, [cq+128*15]
+ mova [cq+128* 8], m23
+ mova [cq+128* 9], m22
+ mova [cq+128*10], m21
+ mova [cq+128*11], m20
+ mova [cq+128*12], m19
+ mova [cq+128*13], m18
+ mova [cq+128*14], m17
+ mova [cq+128*15], m16
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m13, [pw_8192]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ pxor m14, m14
+ lea r6, [strideq*9]
+ cmp eobd, 136
+ jl .main
+ mov r4, dstq
+ call .main
+ add cq, 64-128*4
+ lea dstq, [dstq+strideq*8]
+ call .main
+ add cq, 128*12-64
+ lea dstq, [r4+32]
+ cmp eobd, 543
+ jl .main
+ call .main
+ add cq, 64-128*4
+ lea dstq, [dstq+strideq*8]
+.main:
+ call .main_internal
+ add cq, 128*4
+ pmulhrsw m1, m13, m2
+ pmulhrsw m3, m13, m4
+ pmulhrsw m5, m13, m6
+ pmulhrsw m7, m13, m8
+ call .main_internal
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+.main_internal:
+ mova m8, [cq+128* 0]
+ packssdw m8, [cq+128* 8]
+ mova m6, [cq+128* 1]
+ packssdw m6, [cq+128* 9]
+ mova m0, [cq+128* 2]
+ packssdw m0, [cq+128*10]
+ mova m2, [cq+128* 3]
+ packssdw m2, [cq+128*11]
+ REPX {vpermq x, x, q3120}, m8, m6, m0, m2
+ REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
+ punpcklwd m4, m8, m6
+ punpckhwd m8, m6
+ punpcklwd m6, m0, m2
+ punpckhwd m0, m2
+ REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
+ punpckldq m2, m4, m6 ; 0 1
+ punpckhdq m4, m6 ; 2 3
+ punpckldq m6, m8, m0 ; 4 5
+ punpckhdq m8, m0 ; 6 7
+ ret
+
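+; 16x64 DCT.DCT: the 64-point second pass has more live rows than registers, so PROLOGUE
+; reserves 8*mmsize of stack and m14-m21 are spilled to [rsp] between the even and odd
+; halves of the output.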
+cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+
+ PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 36
+ jl .fast
+ call .pass1
+ cmp eobd, 151
+ jge .full
+ lea r5, [o_base_8bpc]
+
+ punpckhwd m22, m0, m0
+ punpckhwd m23, m1, m1
+ punpckhwd m24, m2, m2
+ punpckhwd m25, m3, m3
+ punpckhwd m26, m4, m4
+ punpckhwd m27, m5, m5
+ punpckhwd m28, m6, m6
+ punpckhwd m29, m7, m7
+ punpcklwd m21, m1, m1
+ punpcklwd m14, m3, m3
+ punpcklwd m18, m5, m5
+ punpcklwd m15, m7, m7
+ pxor m9, m9
+ punpcklwd m9, m9, m0
+ punpcklwd m8, m2, m2
+ punpcklwd m7, m4, m4
+ punpcklwd m1, m6, m6
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [rsp+mmsize*0], m14
+ mova [rsp+mmsize*1], m15
+ mova [rsp+mmsize*2], m16
+ mova [rsp+mmsize*3], m17
+ mova [rsp+mmsize*4], m18
+ mova [rsp+mmsize*5], m19
+ mova [rsp+mmsize*6], m20
+ mova [rsp+mmsize*7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+
+ pxor m12, m12
+ mov r3d, 64*3
+.zero_loop:
+ REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
+ sub r3d, 64
+ jge .zero_loop
+
+ jmp .pass2_end
+.full:
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 64
+ call .pass1
+ sub cq, 64
+ mova m22, [cq+128*0] ; 0 1
+ mova m23, [cq+128*1] ; 2 3
+ mova m24, [cq+128*2] ; 4 5
+ mova m25, [cq+128*3] ; 6 7
+ mova m26, [cq+128*4] ; 8 9
+ mova m27, [cq+128*5] ; 10 11
+ mova m28, [cq+128*6] ; 12 13
+ mova m29, [cq+128*7] ; 14 15
+ mova [cq+64* 8], m0
+ mova [cq+64* 9], m1
+ mova [cq+64*10], m2
+ mova [cq+64*11], m3
+ mova [cq+64*12], m4
+ mova [cq+64*13], m5
+ mova [cq+64*14], m6
+ mova [cq+64*15], m7
+ lea r5, [o_base_8bpc]
+
+ punpcklwd m20, m1, m1
+ punpcklwd m16, m3, m3
+ punpcklwd m19, m5, m5
+ punpcklwd m17, m7, m7
+ punpcklwd m8, m24, m24 ; 4
+ punpcklwd m5, m2, m2 ; 20
+ punpcklwd m1, m28, m28 ; 12
+ punpcklwd m7, m26, m26 ; 8
+ punpcklwd m3, m4, m4 ; 24
+ punpcklwd m4, m6, m6 ; 28
+ pxor m9, m9
+ punpcklwd m6, m9, m0 ; __ 16
+ mova m0, m4
+ punpcklwd m9, m9, m22 ; __ 0
+ call m(idct_16x16_internal_8bpc).main_fast
+ punpcklwd m21, m23, m23 ; 2
+ punpcklwd m15, m29, m29 ; 14
+ punpcklwd m18, m27, m27 ; 10
+ punpcklwd m14, m25, m25 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [rsp+mmsize*0], m14
+ mova [rsp+mmsize*1], m15
+ mova [rsp+mmsize*2], m16
+ mova [rsp+mmsize*3], m17
+ mova [rsp+mmsize*4], m18
+ mova [rsp+mmsize*5], m19
+ mova [rsp+mmsize*6], m20
+ mova [rsp+mmsize*7], m21
+ mova m21, [cq+64*15]
+ mova m14, [cq+64* 8]
+ mova m17, [cq+64*11]
+ mova m18, [cq+64*12]
+ mova m19, [cq+64*13]
+ mova m16, [cq+64*10]
+ mova m15, [cq+64* 9]
+ mova m20, [cq+64*14]
+ REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
+ m24, m19, m16, m27, m28, m15, m20, m23
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
+
+ pxor m12, m12
+ mov r3d, 32*7
+.full_zero_loop:
+ REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
+ sub r3d, 32
+ jge .full_zero_loop
+
+ jmp .pass2_end
+.fast:
+ mova ym0, [cq+128*0]
+ mova ym2, [cq+128*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*2]
+ mova ym3, [cq+128*6]
+ mova ym4, [cq+128*1]
+ mova ym5, [cq+128*3]
+ mova ym6, [cq+128*5]
+ mova ym7, [cq+128*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_2)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(idct8x32p)]
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ mova m6, [dup16_perm]
+ vpermb m0, m8, m0
+ vpermb m2, m8, m2
+ vprold m8, 16
+ vpermb m1, m8, m1
+ vpermb m3, m8, m3
+ punpckldq m4, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m1, m3
+ punpckhdq m1, m3
+ punpckldq m21, m4, m2
+ punpckhdq m14, m4, m2
+ punpckldq m18, m0, m1
+ punpckhdq m15, m0, m1
+ vpord m7, m6, [o(pb_32)] {1to16}
+ vpermb m22, m7, m21 ; 1
+ pmovzxwd m9, ym21 ; 0
+ vpermb m8, m6, m18 ; 4
+ vpermb m24, m7, m18 ; 5
+ vpermb m21, m6, m14 ; 2
+ vpermb m23, m7, m14 ; 3
+ vpermb m14, m6, m15 ; 6
+ vpermb m25, m7, m15 ; 7
+ lea r5, [o_base_8bpc]
+ pslld m9, 16
+
+ pxor m7, m7
+ REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29
+
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [rsp+mmsize*0], m14
+ mova [rsp+mmsize*1], m15
+ mova [rsp+mmsize*2], m16
+ mova [rsp+mmsize*3], m17
+ mova [rsp+mmsize*4], m18
+ mova [rsp+mmsize*5], m19
+ mova [rsp+mmsize*6], m20
+ mova [rsp+mmsize*7], m21
+
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+
+ pxor m12, m12
+ REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
+.pass2_end:
+ movshdup m30, [permC]
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ psrlq m31, m30, 8
+ vpermq m8, m30, m0
+ vpermq m9, m31, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m2
+ vpermq m9, m31, m3
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m4
+ vpermq m9, m31, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m6
+ vpermq m9, m31, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+
+ mova m1, [rsp+mmsize*0]
+ mova m2, [rsp+mmsize*1]
+ mova m3, [rsp+mmsize*2]
+ mova m4, [rsp+mmsize*3]
+ mova m5, [rsp+mmsize*4]
+ mova m6, [rsp+mmsize*5]
+ mova m7, [rsp+mmsize*6]
+ mova m8, [rsp+mmsize*7]
+
+ paddsw m0, m1, m21
+ psubsw m21, m1, m21
+ paddsw m1, m2, m20
+ psubsw m20, m2, m20
+ paddsw m2, m3, m19
+ psubsw m19, m3, m19
+ paddsw m3, m4, m18
+ psubsw m18, m4, m18
+ paddsw m4, m5, m17
+ psubsw m17, m5, m17
+ paddsw m5, m6, m16
+ psubsw m16, m6, m16
+ paddsw m6, m7, m15
+ psubsw m15, m7, m15
+ paddsw m7, m8, m14
+ psubsw m14, m8, m14
+
+ vpermq m8, m30, m0
+ vpermq m9, m31, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m2
+ vpermq m9, m31, m3
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m4
+ vpermq m9, m31, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m6
+ vpermq m9, m31, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+
+ vpermq m8, m30, m14
+ vpermq m9, m31, m15
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m16
+ vpermq m9, m31, m17
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m18
+ vpermq m9, m31, m19
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m20
+ vpermq m9, m31, m21
+ call m(idct_16x8_internal_10bpc).write_16x4
+
+ vpermq m8, m30, m22
+ vpermq m9, m31, m23
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m24
+ vpermq m9, m31, m25
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m26
+ vpermq m9, m31, m27
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m28
+ vpermq m9, m31, m29
+ call m(idct_16x8_internal_10bpc).write_16x4
+ RET
+.pass1:
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 2]
+ mova m2, [cq+128* 4]
+ mova m3, [cq+128* 6]
+ mova m4, [cq+128* 8]
+ mova m5, [cq+128*10]
+ mova m6, [cq+128*12]
+ mova m7, [cq+128*14]
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+128* 1]
+ mova m17, [cq+128* 3]
+ mova m18, [cq+128* 5]
+ mova m19, [cq+128* 7]
+ mova m20, [cq+128* 9]
+ mova m21, [cq+128*11]
+ mova m22, [cq+128*13]
+ mova m23, [cq+128*15]
+ call m(idct_16x16_internal_10bpc).main
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp m(idct_16x16_internal_10bpc).main_end3
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 64
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx16_sse.asm b/third_party/dav1d/src/x86/itx16_sse.asm
new file mode 100644
index 0000000000..3833e17c99
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_sse.asm
@@ -0,0 +1,8135 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2017-2021, The rav1e contributors
+; Copyright © 2020, Nathan Egge
+; Copyright © 2021, Matthias Dressel
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+%macro COEF 1-2
+pd_%1: times 4 dd %1
+%if %0 == 2
+pd_m%1: times 4 dd -%1
+%endif
+%endmacro
+
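+; 12-bit fixed-point cosine/sine constants used by the inverse transforms,
+; e.g. pd_2896 ~= 4096/sqrt(2) and the 1567/3784 pair ~= 4096*{cos,sin}(3*pi/8).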
+COEF 201
+COEF 401
+COEF 601, 1
+COEF 799
+COEF 995
+COEF 1189, 1
+COEF 1380, 1
+COEF 1567
+COEF 1751
+COEF 1931
+COEF 2106, 1
+COEF 2276, 1
+COEF 2440
+COEF 2598, 1
+COEF 2751, 1
+COEF 2896
+COEF 3035
+COEF 3166
+COEF 3290
+COEF 3406
+COEF 3513
+COEF 3612
+COEF 3703
+COEF 3784
+COEF 3857
+COEF 3920
+COEF 3973
+COEF 4017
+COEF 4052
+COEF 4076
+COEF 4091
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+%if ARCH_X86_32
+pd_1: times 4 dd 1
+%endif
+pd_2: times 4 dd 2
+pw_5: times 8 dw 5
+pd_1321: times 4 dd 1321
+pd_2482: times 4 dd 2482
+pd_m3344: times 4 dd -3344
+pd_2048: times 4 dd 2048
+pw_4x2048_4xm2048: times 4 dw 2048
+ times 4 dw -2048
+pw_4xm2048_4x2048: times 4 dw -2048
+ times 4 dw 2048
+pw_2048: times 8 dw 2048
+pw_m2048: times 8 dw -2048
+pd_3803: times 4 dd 3803
+pw_4096: times 8 dw 4096
+pd_5793: times 4 dd 5793
+pd_6144: times 4 dd 6144
+pw_8192: times 8 dw 8192
+pd_10240: times 4 dd 10240
+pd_11586: times 4 dd 11586
+pw_1697x8: times 8 dw 1697*8
+pw_2896x8: times 8 dw 2896*8
+pw_1697x16: times 8 dw 1697*16
+pw_16384: times 8 dw 16384
+pixel_10bpc_max: times 8 dw 0x03ff
+
+pw_1567_3784: times 4 dw 1567, 3784
+pw_m3784_1567: times 4 dw -3784, 1567
+pw_2896_2896: times 4 dw 2896, 2896
+pw_m2896_2896: times 4 dw -2896, 2896
+
+clip_18b_min: times 4 dd -0x20000
+clip_18b_max: times 4 dd 0x1ffff
+
+idct64_mul_16bpc:
+dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
+dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
+dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
+dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
+
+cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
+cextern iadst_4x4_internal_8bpc_ssse3.main
+cextern idct_4x8_internal_8bpc_ssse3.main
+cextern iadst_4x8_internal_8bpc_ssse3.main
+cextern idct_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x4_internal_8bpc_ssse3.main
+cextern iadst_8x4_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
+cextern iadst_8x8_internal_8bpc_ssse3.main
+cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x32_internal_8bpc_ssse3.main
+cextern idct_8x32_internal_8bpc_ssse3.main_fast
+cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
+cextern idct_16x64_internal_8bpc_ssse3.main
+cextern idct_16x64_internal_8bpc_ssse3.main_fast
+
+tbl_4x16_2d: db 0, 13, 29, 45
+tbl_4x16_h: db 0, 16, 32, 48
+tbl_4x16_v: db 0, 4, 8, 12
+
+tbl_8x16_2d: db 0, 14, 30, 46
+tbl_8x16_v: db 0, 4, 8, 12
+tbl_8x16_h: db 0, 32, 64, 96
+
+tbl_16x16_2d: db 0, 10, 36, 78
+tbl_16x16_v: db 0, 4, 8, 12
+tbl_16x16_h: db 0, 64, 128, 192
+
+tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
+
+tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
+
+tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
+tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406
+
+tbl_Nx32_odd_offset: db 2*16, 2*23
+ db 2*20, 2*19
+ db 2*18, 2*21
+ db 2*22, 2*17
+ db 2*30, 2*25
+ db 2*26, 2*29
+ db 2*28, 2*27
+ db 2*24, 2*31
+
+tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
+ db 2* 8, 2*40, 2*23, 2*38
+ db 2* 1, 2*36, 2*20, 2*42
+ db 2* 9, 2*44, 2*19, 2*34
+ db 2* 2, 2*60, 2*18, 2*50
+ db 2*10, 2*52, 2*21, 2*58
+ db 2* 3, 2*56, 2*22, 2*54
+ db 2*11, 2*48, 2*17, 2*62
+
+SECTION .text
+
+%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
+%define m(x) m_suffix(x, SUFFIX)
+
+; This refers to the first function in itx_sse i.e. the start of the text section
+; which is needed as a base pointer for constants.
+%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)
+
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r6-$$+x ; PIC
+%endif
+
+%macro IWHT4_1D 0
+ ; m0 = in0, m1 = in1, m2 = in2, m3 = in3
+ paddd m0, m1 ; in0 += in1
+ psubd m4, m2, m3 ; tmp0 = in2 - in3
+ psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1
+ psrad m5, 1
+ psubd m2, m5, m1 ; in2 = tmp1 - in1
+ psubd m5, m3 ; in1 = tmp1 - in3
+ psubd m0, m5 ; in0 -= in1
+ paddd m4, m2 ; in3 = tmp0 + in2
+ ; m0 = out0, m1 = in1, m2 = out2, m3 = in3
+ ; m4 = out3, m5 = out1
+%endmacro
+
+INIT_XMM sse2
+cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ mova m2, [cq+16*2]
+ mova m3, [cq+16*3]
+ REPX {psrad x, 2}, m0, m1, m2, m3
+ IWHT4_1D
+ punpckldq m1, m0, m5
+ punpckhdq m3, m0, m5
+ punpckldq m5, m2, m4
+ punpckhdq m2, m4
+ punpcklqdq m0, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m4, m3, m2
+ punpckhqdq m3, m2
+ mova m2, m4
+ IWHT4_1D
+ packssdw m0, m4 ; low: out3, high: out0
+ packssdw m2, m5 ; low: out2, high: out1
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ lea r2, [dstq+strideq*2]
+ movq m1, [dstq+strideq*0]
+ movhps m1, [r2 +strideq*1]
+ movq m3, [r2 +strideq*0]
+ movhps m3, [dstq+strideq*1]
+ movd m5, bdmaxm
+ pshuflw m5, m5, q0000 ; broadcast
+ punpcklqdq m5, m5 ; broadcast
+ paddsw m0, m1
+ paddsw m2, m3
+ pmaxsw m0, m4
+ pmaxsw m2, m4
+ pminsw m0, m5
+ pminsw m2, m5
+ movhps [r2 +strideq*1], m0 ; write out0
+ movhps [dstq+strideq*1], m2 ; write out1
+ movq [r2 +strideq*0], m2 ; write out2
+ movq [dstq+strideq*0], m0 ; write out3
+ RET
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 2 = inv_dst1, 4 = inv_dst2
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+; %1 dst/src[1]
+; %2 dst/src[2]
+; %3 tmp[1]
+; %4 tmp[2]
+; %5 tmp[3]
+; %6 rnd
+; %7 coef[1]
+; %8 coef[2]
+; %9 flags
+%ifnidn %7,%8 ; optimize when coef1 == coef2
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+ mova m%3, [o(pd_%8)]
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+ mova m%5, [o(pd_%7)]
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 4 ; invert dst2
+ paddd m%4, m%2
+ psubd m%2, m%6, m%4
+%else
+%ifnum %6
+%ifnidn %7,%8
+ paddd m%4, m%6
+%else
+ paddd m%1, m%6
+%endif
+%endif
+%ifnidn %7,%8
+ paddd m%2, m%4
+%else
+ mova m%3, m%2
+ paddd m%2, m%1
+%endif
+%endif
+%if %9 & 2 ; invert dst1
+ psubd m%3, m%1
+ paddd m%1, m%3, m%6
+%else
+%ifnum %6
+%ifnidn %7,%8
+ paddd m%1, m%6
+%endif
+%endif
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
+
+%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
+cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_16bpc)
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+%if has_epilogue
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jz %%end
+%endif
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+ call %%p1
+ RET
+%%end:
+%else
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 0, 4x4
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 4
+.dconly:
+ add r5d, 128
+ sar r5d, 8
+.dconly2:
+ imul r5d, 2896
+ mova m2, [o(pixel_10bpc_max)]
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ pxor m3, m3
+ punpcklqdq m0, m0
+.dconly_loop:
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ paddw m1, m0
+ pminsw m1, m2
+ pmaxsw m1, m3
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
+ ; butterfly rotation
+ ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0
+ ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3
+ ; Hadamard rotation
+ psubd m%5, m%1, m%2
+ paddd m%2, m%1
+ paddd m%1, m%3, m%4
+ psubd m%3, m%4
+ ; %1 (src1) = out0
+ ; %2 (src2) = out1
+ ; %3 (src3) = out3
+    ; %5 (tmp1) = out2
+%endmacro
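+; a scalar sketch of the 4-point DCT computed here (each product below is
+; rounded with %8 and shifted right by 12 inside ITX_MULSUB_2D):
+;   t0 = (in0 + in2) * 2896          t1 = (in0 - in2) * 2896
+;   t2 = in1 * 1567 - in3 * 3784     t3 = in1 * 3784 + in3 * 1567
+;   out0 = t0 + t3  out1 = t1 + t2  out2 = t1 - t2  out3 = t0 - t3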
+
+INIT_XMM sse4
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, identity
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+
+cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ mova m2, [cq+16*2]
+ mova m3, [cq+16*3]
+ mova m5, [o(pd_2048)]
+ call .pass1_main
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ ; transpose
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass1_main:
+ IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5
+ ret
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m5 = pd_2048
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ pmaddwd m4, m2, [o(pw_m3784_1567)]
+ pmaddwd m2, [o(pw_1567_3784)]
+ pmaddwd m0, m1, [o(pw_m2896_2896)]
+ pmaddwd m1, [o(pw_2896_2896)]
+ REPX {paddd x, m5}, m4, m2, m0, m1
+ packssdw m5, m5 ; pw_2048
+ REPX {psrad x, 12}, m4, m2, m0, m1
+ packssdw m2, m4 ; t3 t2
+ packssdw m1, m0 ; t0 t1
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ movq m3, [r5 +strideq*1]
+ movhps m3, [r5 +strideq*0]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movhps [r5 +strideq*0], m1
+ movq [r5 +strideq*1], m1
+ RET
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call .main
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ ; transpose
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
+.end:
+ mova m4, [o(pw_2048)]
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ movq m3, [r5 +strideq*0]
+ movhps m3, [r5 +strideq*1]
+ mova m5, [o(pixel_10bpc_max)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [r5 +strideq*0], m1
+ movhps [r5 +strideq*1], m1
+ RET
+ALIGN function_align
+.main:
+ mova m1, [cq+16*2]
+ mova m3, [cq+16*3]
+ mova m5, [cq+16*0]
+ lea r3, [cq+16*1]
+.main2:
+ mova m0, [o(pd_1321)] ; SINPI_1_9
+ mova m2, [o(pd_2482)] ; SINPI_2_9
+ mova m6, [o(pd_3803)] ; SINPI_4_9
+ pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2]
+ pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3]
+ pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2]
+ pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0]
+ psubd m1, m3 ; T[2] - T[3]
+ pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3]
+ pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0]
+ paddd m0, m6 ; s[0] += s[3]
+ paddd m0, m3 ; s[0] += s[5]
+ mova m3, [o(pd_m3344)] ; -SINPI_3_9
+ psubd m2, m4 ; s[1] -= s[4]
+ psubd m2, m7 ; s[1] -= s[6]
+ psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0]
+ pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7
+ pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048
+ paddd m4, m0, m2 ; x[3] = s[0] + s[1]
+ psubd m2, m3 ; x[1] = s[1] + s[3]
+ psubd m0, m3 ; x[0] = s[0] + s[3]
+ paddd m4, m3 ; x[3] -= s[3]
+ paddd m2, m5 ; x[1] + 2048
+ REPX {psrad x, 12}, m0, m2, m1, m4
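+    ; the rounded results are returned as m0 = x[0], m2 = x[1], m1 = x[2],
+    ; m4 = x[3]; callers pack m0/m2 into out0/out1 and m1/m4 into out2/out3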
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_16bpc).main
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ ; transpose
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
+ mova m4, [o(pw_2048)]
+ movq m3, [dstq+strideq*1]
+ movhps m3, [dstq+strideq*0]
+ lea r5, [dstq+strideq*2]
+ movq m2, [r5 +strideq*1]
+ movhps m2, [r5 +strideq*0]
+ mova m5, [o(pixel_10bpc_max)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movhps [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m1
+ movhps [r5 +strideq*0], m0
+ movq [r5 +strideq*1], m0
+ RET
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m3, [o(pd_5793)]
+ pmulld m0, m3, [cq+16*0]
+ pmulld m1, m3, [cq+16*1]
+ pmulld m2, m3, [cq+16*2]
+ pmulld m3, [cq+16*3]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ ; transpose
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m5 = pd_2048
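+    ; identity4 scaling: out = x + ((x * 1697 + 2048) >> 12), i.e. roughly x * sqrt(2)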
+ mova m4, [o(pw_1697x8)]
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ pmulhrsw m3, m4, m0
+ pmulhrsw m4, m1
+ paddsw m0, m3
+ paddsw m1, m4
+ movq m3, [r5 +strideq*0]
+ movhps m3, [r5 +strideq*1]
+ mova m4, [o(pixel_10bpc_max)]
+ packssdw m5, m5 ; pw_2048
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ pxor m5, m5
+ mova [cq+16*0], m5
+ mova [cq+16*1], m5
+ mova [cq+16*2], m5
+ mova [cq+16*3], m5
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m5
+ pmaxsw m1, m5
+ pminsw m0, m4
+ pminsw m1, m4
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [r5 +strideq*0], m1
+ movhps [r5 +strideq*1], m1
+ RET
+
+%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 4x8
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity, 9
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+ mova m5, [o(pd_2048)]
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 13
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 13
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+.loop_pass1:
+ mova m3, [o(pd_2896)]
+ pmulld m0, m3, [cq+32*0+r5]
+ pmulld m1, m3, [cq+32*1+r5]
+ pmulld m2, m3, [cq+32*2+r5]
+ pmulld m3, [cq+32*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ call m(idct_4x4_internal_16bpc).pass1_main
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*0+16], m0
+ mova [cq+32*1+16], m4
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova m2, [cq+32*0+16]
+ mova m6, [cq+32*1+16]
+ punpckhwd m4, m2, m6
+ punpcklwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_4x8_internal_8bpc, _ssse3).main
+    ; m0-3 now hold out0/1, 3/2, 4/5, 7/6 (odd registers have their halves swapped)
+ mova m4, [o(pw_2048)]
+ shufps m1, m1, q1032
+ shufps m3, m3, q1032
+.end:
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+ mova m7, [o(pixel_10bpc_max)]
+ lea r2, [strideq*3]
+ movq m5, [dstq+strideq*0]
+ movq m6, [dstq+strideq*2]
+ movhps m5, [dstq+strideq*1]
+ movhps m6, [dstq+r2]
+ lea r4, [dstq+strideq*4]
+ paddw m0, m5
+ paddw m1, m6
+ movq m5, [r4+strideq*0]
+ movq m6, [r4+strideq*2]
+ movhps m5, [r4+strideq*1]
+ movhps m6, [r4+r2]
+ paddw m2, m5
+ paddw m3, m6
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r2 ], m1
+ movq [r4 +strideq*0], m2
+ movhps [r4 +strideq*1], m2
+ movq [r4 +strideq*2], m3
+ movhps [r4 +r2 ], m3
+ RET
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity, 9
+
+cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call .pass1_main
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova m2, [cq+32*2+16]
+ mova m6, [cq+32*3+16]
+ punpckhwd m4, m2, m6
+ punpcklwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass1_main:
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 13
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 13
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+ lea r3, [cq+32*1+16]
+.loop_pass1:
+ mova m0, [o(pd_2048)]
+ mova m3, [o(pd_2896)]
+ pmulld m5, m3, [cq+32*0+r5]
+ pmulld m2, m3, [cq+32*1+r5]
+ pmulld m1, m3, [cq+32*2+r5]
+ pmulld m3, [cq+32*3+r5]
+ REPX {paddd x, m0}, m5, m2, m1, m3
+ REPX {psrad x, 12}, m5, m2, m1, m3
+ mova [r3], m2
+ call m(iadst_4x4_internal_16bpc).main2
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*2+16], m0
+ mova [cq+32*3+16], m1
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ ret
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
+ mova m4, [o(pw_4x2048_4xm2048)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity, 9
+
+cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_16bpc).pass1_main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ mova m6, [cq+32*2+16]
+ mova m2, [cq+32*3+16]
+ punpcklwd m4, m2, m6
+ punpckhwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
+ mova m4, m0
+ mova m5, m1
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ pshufd m2, m5, q1032
+ pshufd m3, m4, q1032
+ mova m4, [o(pw_4xm2048_4x2048)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity, 3
+
+cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+ mova m5, [o(pd_2048)]
+ mova m4, [o(pd_2896)]
+ mova m6, [o(pd_5793)]
+ ; clear m7 in case we skip the bottom square
+ pxor m7, m7
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 16
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 16
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+.loop_pass1:
+ pmulld m0, m4, [cq+32*0+r5]
+ pmulld m1, m4, [cq+32*1+r5]
+ pmulld m2, m4, [cq+32*2+r5]
+ pmulld m3, m4, [cq+32*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {pmulld x, m6}, m0, m1, m2, m3
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*0+16], m0
+ mova m7, m2
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ punpckhwd m4, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m4
+ punpcklwd m0, m4
+ mova m2, [cq+32*0+16]
+ punpckhwd m4, m2, m7
+ punpcklwd m2, m7
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+ mova m4, [o(pw_4096)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+ INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity, v
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
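+    ; scan the eob threshold table (r5, set up by INV_TXFM_FN) from its last
+    ; entry down until eob >= the entry; r6d is then the number of extra pass-1
+    ; iterations needed, and r5d becomes the matching start offset into cq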
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+ mova m5, [o(pd_2048)]
+.loop_pass1:
+ mova m0, [cq+64*0+r5]
+ mova m1, [cq+64*1+r5]
+ mova m2, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(idct_4x4_internal_16bpc).pass1_main
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m1, m4, m2
+ REPX {psrad x, 1}, m0, m1, m4, m2
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.end_pass1:
+ mova m2, [cq+64*0+16]
+ mova m3, [cq+64*1+16]
+ mova m4, [cq+64*0+32]
+ mova m5, [cq+64*1+32]
+ mova m6, [cq+64*0+48]
+ mova m7, [cq+64*1+48]
+ ; m0-7 = packed & transposed output
+ jmp tx2q
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_16x4_internal_8bpc, _ssse3).main
+    ; m0-6 now hold out0-13, with each odd register holding its pair in swapped order
+    ; [cq+16*7] holds out15/14
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [cq+16*7]
+ REPX {shufps x, x, q1032}, m1, m3, m5, m7
+ mova [cq+16*0], m4
+ mova [cq+16*1], m5
+ mova [cq+16*2], m6
+ mova [cq+16*3], m7
+.end:
+ pxor m4, m4
+ REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ mova m7, [o(pixel_10bpc_max)]
+ mov r5d, 2
+ lea r3, [strideq*3]
+.loop:
+ movq m5, [dstq+strideq*0]
+ movq m6, [dstq+strideq*2]
+ movhps m5, [dstq+strideq*1]
+ movhps m6, [dstq+r3]
+ lea r4, [dstq+strideq*4]
+ paddw m0, m5
+ paddw m1, m6
+ movq m5, [r4+strideq*0]
+ movq m6, [r4+strideq*2]
+ movhps m5, [r4+strideq*1]
+ movhps m6, [r4+r3]
+ paddw m2, m5
+ paddw m3, m6
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r3 ], m1
+ movq [r4 +strideq*0], m2
+ movhps [r4 +strideq*1], m2
+ movq [r4 +strideq*2], m3
+ movhps [r4 +r3 ], m3
+ dec r5d
+ jz .end2
+ lea dstq, [dstq+strideq*8]
+ mova m0, [cq+0*16]
+ mova m1, [cq+1*16]
+ mova m2, [cq+2*16]
+ mova m3, [cq+3*16]
+ REPX {mova [cq+x*16], m4}, 0, 1, 2, 3
+ jmp .loop
+.end2:
+ RET
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity, v
+
+cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r6+r5]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+.loop_pass1:
+ mova m5, [cq+64*0+r5]
+ lea r3, [cq+64*1+r5]
+ mova m1, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(iadst_4x4_internal_16bpc).main2
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m2, m1, m4
+ REPX {psrad x, 1}, m0, m2, m1, m4
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
+ ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8
+ ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13
+ mova m1, [o(pw_4x2048_4xm2048)]
+ REPX {pmulhrsw x, m1}, m7, m2, m0
+ pshufd m6, m1, q1032 ; 4x-2048,4x2048
+ pmulhrsw m1, [cq+16*7]
+ REPX {pmulhrsw x, m6}, m5, m4, m3
+ pmulhrsw m6, [cq+16*6]
+ ; m7/5/2/4 = out4/11,5/10,6/9,7/8
+ ; m0/3/6/1 = out0/15,3/12,1/14,2/13
+ ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
+ movhps [cq+0*8], m4
+ movhps [cq+1*8], m2
+ movhps [cq+2*8], m5
+ movhps [cq+3*8], m7
+ movhps [cq+4*8], m3
+ movhps [cq+5*8], m1
+ movhps [cq+6*8], m6
+ movhps [cq+7*8], m0
+ punpcklqdq m0, m6
+ punpcklqdq m1, m3
+ punpcklqdq m3, m2, m4
+ punpcklqdq m2, m7, m5
+ jmp m(idct_4x16_internal_16bpc).end
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity, v
+
+cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+.loop_pass1:
+ mova m5, [cq+64*0+r5]
+ lea r3, [cq+64*1+r5]
+ mova m1, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(iadst_4x4_internal_16bpc).main2
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m2, m1, m4
+ REPX {psrad x, 1}, m0, m2, m1, m4
+ packssdw m0, m2 ; out3 out2
+ packssdw m1, m4 ; out1 out0
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
+ ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7
+ ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2
+ mova m1, [o(pw_4x2048_4xm2048)]
+ REPX {pmulhrsw x, m1}, m7, m2, m0
+ pshufd m6, m1, q1032 ; 4x-2048,4x2048
+ pmulhrsw m1, [cq+16*7]
+ REPX {pmulhrsw x, m6}, m5, m4, m3
+ pmulhrsw m6, [cq+16*6]
+ ; m7/5/2/4 = out11/4,10/5,9/6,8/7
+ ; m0/3/6/1 = out15/0,12/3,14/1,13/2
+ ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
+ movq [cq+0*8], m4
+ movq [cq+1*8], m2
+ movq [cq+2*8], m5
+ movq [cq+3*8], m7
+ movq [cq+4*8], m3
+ movq [cq+5*8], m1
+ movq [cq+6*8], m6
+ movq [cq+7*8], m0
+ punpckhqdq m0, m6
+ punpckhqdq m1, m3
+ punpckhqdq m3, m2, m4
+ punpckhqdq m2, m7, m5
+ jmp m(idct_4x16_internal_16bpc).end
+
+INV_TXFM_4X16_FN identity, dct, h
+INV_TXFM_4X16_FN identity, adst, h
+INV_TXFM_4X16_FN identity, flipadst, h
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+ mova m5, [o(pd_6144)]
+ mova m4, [o(pd_5793)]
+.loop_pass1:
+ pmulld m0, m4, [cq+64*0+r5]
+ pmulld m1, m4, [cq+64*1+r5]
+ pmulld m2, m4, [cq+64*2+r5]
+ pmulld m3, m4, [cq+64*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 13}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+ mova [cq+16*4], m0
+ mova [cq+16*5], m1
+ mova [cq+16*6], m2
+ mova [cq+16*7], m7
+ mova m0, [o(pw_1697x16)]
+ mova m7, [o(pw_2048)]
+ pmulhrsw m1, m0, m4
+ pmulhrsw m2, m0, m5
+ REPX {paddsw x, x}, m4, m5
+ paddsw m4, m1
+ paddsw m5, m2
+ REPX {pmulhrsw x, m7}, m4, m5
+ mova [cq+16*0], m4
+ mova [cq+16*1], m5
+ mova m4, [cq+16*7]
+ pmulhrsw m1, m0, m6
+ pmulhrsw m2, m0, m4
+ REPX {paddsw x, x}, m6, m4
+ paddsw m6, m1
+ paddsw m4, m2
+ REPX {pmulhrsw x, m7}, m6, m4
+ mova [cq+16*2], m6
+ mova [cq+16*3], m4
+ mova m4, [cq+16*4]
+ mova m1, [cq+16*5]
+ mova m2, [cq+16*6]
+ pmulhrsw m5, m0, m2
+ pmulhrsw m6, m0, m3
+ REPX {paddsw x, x}, m2, m3
+ paddsw m2, m5
+ paddsw m3, m6
+ pmulhrsw m6, m0, m1
+ pmulhrsw m0, m4
+ REPX {paddsw x, x}, m1, m4
+ paddsw m1, m6
+ paddsw m0, m4
+ REPX {pmulhrsw x, m7}, m2, m3, m1, m0
+ jmp m(idct_4x16_internal_16bpc).end
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, 0, 8x4, 15
+%else
+ INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+ lea r2, [strideq*3]
+ mova m1, [dstq+strideq*0]
+ mova m2, [dstq+strideq*1]
+ mova m3, [dstq+strideq*2]
+ mova m4, [dstq+r2]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*2], m3
+ mova [dstq+r2 ], m4
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, identity
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+
+cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+.pass1_entry:
+%if ARCH_X86_32
+ lea r3, [rsp+gprsize]
+%else
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+0*16]
+ mova m1, [cq+1*16]
+ mova m2, [cq+2*16]
+ mova m3, [cq+3*16]
+ mova m4, [cq+4*16]
+ mova m5, [cq+5*16]
+ mova m6, [cq+6*16]
+ mova m7, [cq+7*16]
+ call .rect2_mul
+ call r5
+ call .transpose4x8packed
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.transpose4x8packed:
+ ; transpose
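+    ; in:  m0/m2/m4/m6 = packed words of outputs 0-1, 2-3, 4-5, 6-7
+    ; out: m0-3 = one 8-word row of the transposed 4x8 block each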
+ punpcklwd m1, m2, m6
+ punpckhwd m2, m6
+ punpckhwd m6, m0, m4
+ punpcklwd m0, m4
+
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m4, m6, m2
+ punpcklwd m6, m2
+
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+ punpckhwd m1, m0, m6
+ punpcklwd m0, m6
+ ret
+.main:
+ call .main_pass1
+ call .round
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
+.rect2_mul:
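+    ; rect2 scale for 2:1 block sizes: c = (c * 2896 + 2048) >> 12, i.e. each
+    ; coefficient is multiplied by 1/sqrt(2) in .12 fixed point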
+%if ARCH_X86_64
+ REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+ mova [r3], m7
+ mova m7, [o(pd_2896)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_2048)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+%endif
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+%if ARCH_X86_64
+.main_pass1_fast:
+ pmulld m5, m3, [o(pd_m2276)]
+ pmulld m3, [o(pd_3406)]
+ pmulld m7, m1, [o(pd_4017)]
+ pmulld m1, [o(pd_799)]
+ pmulld m6, m2, [o(pd_3784)]
+ pmulld m2, [o(pd_1567)]
+ pmulld m0, m14
+ pxor m4, m4
+ jmp .main_pass1_fast2
+.main_pass1:
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
+ REPX {pmulld x, m14}, m0, m4
+.main_pass1_fast2:
+ REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7
+ REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m9, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ REPX {pmaxsd x, m12}, m1, m8, m7, m9
+ REPX {pminsd x, m13}, m1, m8, m7, m9
+ REPX {pmulld x, m14}, m7, m1
+ paddd m0, m11
+ paddd m7, m11
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+ REPX {pmaxsd x, m12}, m0, m6, m5, m3
+ REPX {pminsd x, m13}, m0, m6, m5, m3
+ ret
+.round:
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+%else
+.main_pass1_fast:
+ pmulld m5, m3, [o(pd_m2276)]
+ pmulld m3, [o(pd_3406)]
+ pmulld m7, m1, [o(pd_4017)]
+ pmulld m1, [o(pd_799)]
+ pmulld m6, m2, [o(pd_3784)]
+ pmulld m2, [o(pd_1567)]
+ mova m4, [o(pd_2048)]
+ mova [r3+0*16], m2
+ REPX {paddd x, m4}, m5, m3, m7, m1
+ REPX {psrad x, 12}, m5, m3, m7, m1
+ paddd m2, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ pmulld m5, m0, [o(pd_2896)]
+ mova m0, m4
+ paddd m4, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3 }, m1, m2, m7, m4
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3 }, m1, m2, m7, m4
+ mova [r3+3*16], m2
+ mova [r3+1*16], m4
+ pxor m4, m4
+ mova m2, [r3+0*16]
+ mova m3, [o(pd_2896)]
+ jmp .main_pass1_fast2
+.main_pass1:
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m4
+ mova [r3+3*16], m6
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a
+ paddd m2, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m4, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ mova m6, [o(clip_18b_min)]
+ REPX {pmaxsd x, m6 }, m1, m2, m7, m4
+ mova m6, [o(clip_18b_max)]
+ REPX {pminsd x, m6 }, m1, m2, m7, m4
+ mova m6, [r3+3*16]
+ mova [r3+3*16], m2
+ mova m2, [r3+1*16]
+ mova [r3+1*16], m4
+
+ ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3
+ mova m3, [o(pd_2896)]
+ mova m5, [r3+0*16]
+ mova m4, [r3+2*16]
+ REPX {pmulld x, m3 }, m5, m4
+.main_pass1_fast2:
+ REPX {paddd x, m0 }, m2, m6
+ REPX {psrad x, 12 }, m2, m6
+ REPX {pmulld x, m3 }, m7, m1
+ paddd m7, m0
+ paddd m0, m5
+
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+
+ mova m1, [o(clip_18b_min)]
+ REPX {pmaxsd x, m1 }, m0, m6, m5, m3
+ mova m1, [o(clip_18b_max)]
+ REPX {pminsd x, m1 }, m0, m6, m5, m3
+ ret
+.round:
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ mova [r3+0*16], m6
+ mova m6, [r3+1*16]
+ psubd m7, m0, m6 ; out7
+ paddd m0, m6 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ mova m6, [r3+3*16]
+ psubd m4, m3, m6 ; out4
+ paddd m3, m6 ; out3
+ mova m6, [r3+0*16]
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
+.end:
+ lea r3, [strideq*3]
+ call .round2_and_write_8x4
+ REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ RET
+.round2_and_write_8x4:
+ pxor m6, m6
+ mova m5, [o(pixel_10bpc_max)]
+ mova m4, [o(pw_2048)]
+.round1_and_write_8x4:
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+.write_8x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3]
+ REPX {pminsw x, m5}, m0, m1, m2, m3
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ call .main_pass1
+ call .round
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
+.main_pass1:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
+ psubd m8, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m5, m1 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
+ REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
+ ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a
+ psubd m9, m6, m8 ; t7
+ paddd m6, m8 ; out6
+ mova m8, [o(pd_2896)]
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m2 ; t2
+ paddd m0, m2 ; out0
+ psubd m2, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ REPX {pmaxsd x, m12}, m5, m3, m2, m9
+ REPX {pminsd x, m13}, m5, m3, m2, m9
+ REPX {pmulld x, m14}, m5, m3, m2, m9
+ psubd m4, m5, m3 ; (t2 - t3) * 2896
+ paddd m3, m5 ; (t2 + t3) * 2896
+ psubd m5, m2, m9 ; (t6 - t7) * 2896
+ paddd m2, m9 ; (t6 + t7) * 2896
+ ret
+.round:
+
+ ; m0=out0,m1=-out1,m6=out6,m7=-out7
+
+ pcmpeqd m8, m8
+ REPX {pxor x, m8 }, m1, m7, m3, m5
+ REPX {psubd x, m8 }, m1, m7
+ REPX {paddd x, m11}, m2, m3, m4, m5
+ REPX {psrad x, 12 }, m2, m3, m4, m5
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m5, [o(pd_2048)]
+
+ ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
+ mova m2, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova m4, [r3+2*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m6
+ mova m1, [r3+3*16]
+ mova [r3+3*16], m7
+ ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
+ mova m0, [r3+0*16]
+ mova m6, [r3+2*16]
+ psubd m7, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ mova [r3+0*16], m7
+ mova m5, [r3+1*16]
+ mova m7, [r3+3*16]
+ psubd m4, m1, m5 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
+ mova [r3+1*16], m7
+ mova m7, [o(clip_18b_max)]
+ pmaxsd m3, [r3+0*16]
+ REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
+ pminsd m7, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a
+ mova m5, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m2, m6, m3 ; t7
+ paddd m6, m3 ; out6
+ mova [r3+3*16], m6
+ mova m0, [r3+0*16]
+ mova m6, [r3+1*16]
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m6 ; t2
+ paddd m0, m6 ; out0
+ psubd m6, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ mova m4, [o(clip_18b_min)]
+ REPX {pmaxsd x, m4 }, m5, m3, m6, m2
+ mova m4, [o(clip_18b_max)]
+ REPX {pminsd x, m4 }, m5, m3, m6, m2
+ mova m4, [o(pd_2896)]
+ REPX {pmulld x, m4 }, m5, m3, m6, m2
+ psubd m4, m5, m3 ; (t2 - t3) * 2896
+ paddd m3, m5 ; (t2 + t3) * 2896
+ psubd m5, m6, m2 ; (t6 - t7) * 2896
+ paddd m2, m6 ; (t6 + t7) * 2896
+ ret
+.round:
+ mova [r3+2*16], m0
+
+ pcmpeqd m0, m0
+ mova m6, [o(pd_2048)]
+ REPX {pxor x, m0 }, m1, m7, m3, m5
+ REPX {psubd x, m0 }, m1, m7
+ REPX {paddd x, m6 }, m2, m3, m4, m5
+ REPX {psrad x, 12 }, m2, m3, m4, m5
+
+ mova m6, [r3+3*16]
+ mova m0, [r3+2*16]
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
+ jmp m(idct_8x4_internal_16bpc).end
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call m(iadst_8x4_internal_16bpc).round
+ packssdw m7, m6
+ packssdw m5, m4
+ packssdw m3, m2
+ packssdw m1, m0
+ mova m0, m7
+ mova m2, m5
+ mova m4, m3
+ mova m6, m1
+ ret
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
+ lea r3, [strideq*3]
+ add dstq, r3
+ neg strideq
+ jmp m(idct_8x4_internal_16bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
+.pass2:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_8x4_internal_16bpc).end
+
+%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16
+%else
+ INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 2
+.end:
+ add r5d, 384
+ sar r5d, 9
+.end2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+ lea r2, [strideq*3]
+.loop:
+ mova m1, [dstq+strideq*0]
+ mova m2, [dstq+strideq*1]
+ mova m3, [dstq+strideq*2]
+ mova m4, [dstq+r2]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*2], m3
+ mova [dstq+r2 ], m4
+ lea dstq, [dstq+strideq*4]
+ dec r3d
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity, 6
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1
+ mov [rsp+4*16+1*gprsize], r1
+%else
+ DECLARE_REG_TMP 6
+%endif
+ lea t0, [o(.pass1_main)]
+
+.pass1_full:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+%if ARCH_X86_32
+ lea r3, [rsp+gprsize]
+%endif
+.loop_pass1:
+ mova m0, [cq+0*32+r5]
+ mova m1, [cq+1*32+r5]
+ mova m2, [cq+2*32+r5]
+ mova m3, [cq+3*32+r5]
+ mova m4, [cq+4*32+r5]
+ mova m5, [cq+5*32+r5]
+ mova m6, [cq+6*32+r5]
+ mova m7, [cq+7*32+r5]
+ call t0
+
+ test r5d, r5d
+ jz .end_pass1
+
+ mova [cq+0*32+16], m0
+ mova [cq+1*32+16], m1
+ mova [cq+2*32+16], m2
+ mova [cq+3*32+16], m3
+
+ sub r5d, 16
+ jmp .loop_pass1
+.end_pass1:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ mov r1, [rsp+4*16+1*gprsize]
+%endif
+ jmp tx2q
+.pass1_main:
+ call m(idct_8x4_internal_16bpc).main_pass1
+ pcmpeqd m1, m1
+ REPX {psubd x, m1}, m0, m6, m5, m3
+ call m(idct_8x4_internal_16bpc).round
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+.pack_and_transpose:
+ packssdw m2, m3
+ packssdw m6, m7
+ packssdw m0, m1
+ packssdw m4, m5
+ jmp m(idct_8x4_internal_16bpc).transpose4x8packed
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+%endif
+ call .round3_and_write_8x8
+.zero:
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%undef mzero
+ RET
+
+    ; round (rounded right-shift by 4) before writing
+ ; data in m0-7
+ ; on x86-64, pw_2048 is in m8
+ ; .round1 is for m0-7
+ ; .round2 is for m0-6 & [rsp+gprsize*2]
+    ; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
+ ; .round4 is x86-32-only, it is similar to .round2 but with constant already in m7
+%if ARCH_X86_32
+.round1_and_write_8x8:
+ mova [rsp+gprsize*2], m7
+.round2_and_write_8x8:
+%endif
+.round3_and_write_8x8:
+ mova m7, [o(pw_2048)]
+%if ARCH_X86_32
+.round4_and_write_8x8:
+%endif
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [rsp+gprsize*2]
+%if ARCH_X86_64
+ jmp .write_8x8
+.round2_and_write_8x8:
+ mova m7, [rsp+gprsize*2]
+.round1_and_write_8x8:
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+%endif
+
+ ; m0-7 have to-be-written data [pre-rounded]
+ ; on x86-64, m9-10 contain a zero/pixel_max
+ ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
+ ; r0,1,3 contain dstq/strideq/stride3q
+ ; r5 is a scratch register
+.write_8x8:
+ lea r5, [dstq+strideq*4]
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3]
+ paddw m4, [r5 +strideq*0]
+ paddw m5, [r5 +strideq*1]
+ paddw m6, [r5 +strideq*2]
+ paddw m7, [r5 +r3]
+%if ARCH_X86_64
+ REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+ mova [rsp+gprsize*2], m7
+ pxor m7, m7
+ REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmaxsw m7, [rsp+gprsize*2]
+ mova [rsp+gprsize*2], m7
+ mova m7, [o(pixel_10bpc_max)]
+ REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsw m7, [rsp+gprsize*2]
+%endif
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ mova [r5 +strideq*0], m4
+ mova [r5 +strideq*1], m5
+ mova [r5 +strideq*2], m6
+ mova [r5 +r3 ], m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity, 6
+
+cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+4*16+1*gprsize], r1
+%endif
+ lea t0, [o(.pass1_main)]
+ jmp m(idct_8x8_internal_16bpc).pass1_full
+.pass1_main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call .round
+ jmp m(idct_8x8_internal_16bpc).pack_and_transpose
+.round:
+%if ARCH_X86_64
+ pcmpeqd m8, m8 ; -1
+ REPX {psubd x, m8 }, m0, m6
+ REPX {pxor x, m8 }, m1, m7, m3, m5
+ REPX {psrad x, 1 }, m0, m1, m6, m7
+ REPX {psubd x, m8 }, m1, m7
+ mova m8, [o(pd_6144)]
+ REPX {paddd x, m8 }, m2, m3, m4, m5
+ REPX {psrad x, 13 }, m2, m3, m4, m5
+%else
+ mova [r3+2*16], m0
+
+ pcmpeqd m0, m0 ; -1
+ mova m6, [o(pd_6144)]
+ REPX {pxor x, m0 }, m1, m7, m3, m5
+ REPX {psrad x, 1 }, m1, m7
+ REPX {psubd x, m0 }, m1, m7
+ REPX {paddd x, m6 }, m2, m3, m4, m5
+ REPX {psrad x, 13 }, m2, m3, m4, m5
+
+ mova m0, [r3+2*16]
+ psrld m6, 12 ; +1
+ paddd m0, m6
+ paddd m6, [r3+3*16]
+ REPX {psrad x, 1 }, m0, m6
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+%endif
+ call .round3_and_write_8x8
+ jmp m(idct_8x8_internal_16bpc).zero
+
+    ; round (rounded right-shift by 4) before writing; odd registers are negated
+ ; data in m0-7
+ ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
+ ; .round1 is for m0-7
+ ; .round2 is for m0-6 & [rsp+gprsize*2]
+    ; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
+%if ARCH_X86_64
+.round2_and_write_8x8:
+ mova m7, [rsp+gprsize*2]
+.round1_and_write_8x8:
+ REPX {pmulhrsw x, m8 }, m0, m2, m4, m6
+ REPX {pmulhrsw x, m11}, m1, m3, m5, m7
+ jmp m(idct_8x8_internal_16bpc).write_8x8
+%else
+.round1_and_write_8x8:
+ mova [rsp+gprsize*2], m7
+.round2_and_write_8x8:
+%endif
+.round3_and_write_8x8:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova m7, [o(pw_m2048)]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize*2]
+ jmp m(idct_8x8_internal_16bpc).write_8x8
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity, 6
+
+cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+4*16+1*gprsize], r1
+%endif
+ lea t0, [o(.pass1_main)]
+ jmp m(idct_8x8_internal_16bpc).pass1_full
+.pass1_main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call m(iadst_8x8_internal_16bpc).round
+ ; invert registers
+ packssdw m7, m6
+ packssdw m5, m4
+ packssdw m3, m2
+ packssdw m1, m0
+ mova m0, m7
+ mova m2, m5
+ mova m4, m3
+ mova m6, m1
+ jmp m(idct_8x4_internal_16bpc).transpose4x8packed
+
+.pass2:
+ lea dstq, [dstq+strideq*8]
+ sub dstq, strideq
+ neg strideq
+ jmp m(iadst_8x8_internal_16bpc).pass2
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+0*32]
+ mova m1, [cq+1*32]
+ mova m2, [cq+2*32]
+ mova m3, [cq+3*32]
+ mova m4, [cq+4*32]
+ mova m5, [cq+5*32]
+ mova m6, [cq+6*32]
+ mova m7, [cq+7*32]
+ packssdw m0, [cq+0*32+16]
+ packssdw m1, [cq+1*32+16]
+ packssdw m2, [cq+2*32+16]
+ packssdw m3, [cq+3*32+16]
+ packssdw m4, [cq+4*32+16]
+ packssdw m5, [cq+5*32+16]
+ packssdw m6, [cq+6*32+16]
+ packssdw m7, [cq+7*32+16]
+ mova [rsp+gprsize+16*1], m6
+ jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mova m8, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+ mova [rsp+gprsize], m7
+ mova m7, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+ jmp m(idct_8x8_internal_16bpc).zero
+
+%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
+%else
+ INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ mov r3d, 4
+%if stack_size_padded > 0
+ ; adjust to caller's stack allocation
+ add rsp, (12+ARCH_X86_64)*16
+%endif
+ jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, v
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 7
+%endif
+
+cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
+.pass1_full:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+%undef cmp
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, [rsp+16*16+2*gprsize]
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+%endif
+.loop_pass1:
+ mova m0, [cq+0*64+r5]
+ mova m1, [cq+1*64+r5]
+ mova m2, [cq+2*64+r5]
+ mova m3, [cq+3*64+r5]
+ mova m4, [cq+4*64+r5]
+ mova m5, [cq+5*64+r5]
+ mova m6, [cq+6*64+r5]
+ mova m7, [cq+7*64+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call t0
+
+ mova [cq+0*64+r5], m0
+ mova [cq+1*64+r5], m1
+ mova [cq+2*64+r5], m2
+ mova [cq+3*64+r5], m3
+ sub r5d, 16
+ jge .loop_pass1
+%if WIN64
+ POP r7
+%elif ARCH_X86_32
+ mov r1, [rsp+16*16+1*gprsize]
+%endif
+ jmp tx2q
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+
+    ; input is in cq+N*16, where N = 0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
+    ; some of it is still pre-loaded from the final loop iteration of pass 1
+
+ mova m1, m2
+ mova m2, [cq+ 1*16]
+ mova m3, [cq+ 9*16]
+ mova m4, [cq+ 2*16]
+ mova m5, [cq+10*16]
+ mova m6, [cq+ 3*16]
+ mova m7, [cq+11*16]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ ; m7 is already stored in [rsp+gprsize+0*16]
+ mova m0, [cq+ 4*16]
+ mova m1, [cq+12*16]
+ mova m2, [cq+ 5*16]
+ mova m3, [cq+13*16]
+ mova m4, [cq+ 6*16]
+ mova m5, [cq+14*16]
+ mova m6, [cq+ 7*16]
+ mova m7, [cq+15*16]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+    ; out0-7 are in [rsp+gprsize+3*mmsize] .. [rsp+gprsize+10*mmsize]
+    ; out8-14 are in m0-6, and out15 is in m7 as well as [rsp+gprsize+0*mmsize]
+
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mov r6, dstq
+%else
+ mov [rsp+16*16+gprsize*1], dstq
+%endif
+ lea r3, [strideq*3]
+ lea dstq, [dstq+strideq*8]
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+%undef mzero
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+%if ARCH_X86_64
+ mov dstq, r6
+%else
+ mov dstq, [rsp+16*16+gprsize*1]
+%endif
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ RET
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity, v
+
+cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m4, [cq+ 9*16]
+ mova m5, [cq+13*16]
+ mova [rsp+gprsize+7*16], m0
+ mova [rsp+gprsize+8*16], m1
+ mova [rsp+gprsize+5*16], m4
+ mova [rsp+gprsize+6*16], m5
+ mova m0, m2
+ mova m1, m3
+ mova m2, [cq+ 1*16]
+ mova m3, [cq+ 5*16]
+ mova m4, [cq+ 2*16]
+ mova m5, [cq+ 6*16]
+ mova m6, [cq+11*16]
+ mova m7, [cq+15*16]
+ mova [rsp+gprsize+ 3*16], m4
+ mova [rsp+gprsize+ 4*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m4, [cq+10*16]
+ mova m5, [cq+14*16]
+ mova m6, [cq+ 3*16]
+ mova m7, [cq+ 7*16]
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
+
+%if ARCH_X86_64
+ mova m11, [o(pw_m2048)]
+ mova m8, [o(pw_2048)]
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mov r6, dstq
+%else
+ mov [rsp+16*16+gprsize*1], dstq
+%endif
+ lea r3, [strideq*3]
+ lea dstq, [dstq+strideq*8]
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+%undef mzero
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+%if ARCH_X86_64
+ mov dstq, r6
+%else
+ mov dstq, [rsp+16*16+gprsize*1]
+%endif
+ call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
+ RET
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity, v
+
+cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+ lea r3, [strideq*3]
+ lea r3, [r3*5]
+ add dstq, r3
+ neg strideq
+ jmp m(iadst_8x16_internal_16bpc).pass2
+
+INV_TXFM_8X16_FN identity, dct, h
+INV_TXFM_8X16_FN identity, adst, h
+INV_TXFM_8X16_FN identity, flipadst, h
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+%if ARCH_X86_64
+ mova m4, [o(pw_2048)]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mova m7, [o(pw_1697x16)]
+%endif
+ mov r5d, 4
+ lea r3, [strideq*3]
+.pass2_loop:
+ call .main
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).round1_and_write_8x4
+%else
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+%endif
+ REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
+ dec r5d
+ jle .end
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+ 4*16]
+ mova m2, [cq+ 8*16]
+ mova m3, [cq+12*16]
+ jmp .pass2_loop
+.end:
+ RET
+.main:
+ ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
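+    ; i.e. out = 2*x + x*1697/2048, the ~2*sqrt(2) scaling of the 16-point identity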
+%if ARCH_X86_32
+ mova m7, [o(pw_1697x16)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+%else
+ pmulhrsw m8, m7, m0
+ pmulhrsw m9, m7, m1
+ pmulhrsw m10, m7, m2
+ pmulhrsw m11, m7, m3
+%endif
+ REPX {paddsw x, x}, m0, m1, m2, m3
+%if ARCH_X86_64
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m2, m10
+ paddsw m3, m11
+%else
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+%endif
+ ret
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16
+%else
+ INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 4
+.dconly:
+ add r5d, 384
+ sar r5d, 9
+.dconly2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m3, [o(pixel_10bpc_max)]
+ pxor m4, m4
+.loop:
+ mova m1, [dstq+ 0]
+ mova m2, [dstq+16]
+ REPX {paddw x, m0}, m1, m2
+ REPX {pminsw x, m3}, m1, m2
+ REPX {pmaxsw x, m4}, m1, m2
+ mova [dstq+ 0], m1
+ mova [dstq+16], m2
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+
+ mova m0, [cq+ 1*16]
+ mova m1, [cq+ 3*16]
+ mova m2, [cq+ 5*16]
+ mova m3, [cq+ 7*16]
+ mova m4, [cq+ 9*16]
+ mova m5, [cq+11*16]
+ mova m6, [cq+13*16]
+ mova m7, [cq+15*16]
+ call .main_oddhalf
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+ 2*16]
+ mova m2, [cq+ 4*16]
+ mova m3, [cq+ 6*16]
+ mova m4, [cq+ 8*16]
+ mova m5, [cq+10*16]
+ mova m6, [cq+12*16]
+ mova m7, [cq+14*16]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ ; t0-7 is in m0-7
+
+ call .round
+
+%if ARCH_X86_64
+.pack_transpose:
+ ; transpose in two parts
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+.transpose:
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call .transpose4x8packed_hi
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m4, [r3+10*16]
+ mova m6, [r3+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ jmp tx2q
+%if ARCH_X86_64
+.transpose4x8packed_hi:
+ punpcklwd m9, m10, m14
+ punpckhwd m10, m14
+ punpckhwd m14, m8, m12
+ punpcklwd m8, m12
+
+ punpckhwd m11, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m12, m14, m10
+ punpcklwd m14, m10
+
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ punpckhwd m9, m8, m14
+ punpcklwd m8, m14
+ ret
+%endif
+.main_oddhalf_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_4076)]
+ pmulld m0, [o(pd_401)]
+ pmulld m6, m1, [o(pd_m1189)]
+ pmulld m1, [o(pd_3920)]
+%if ARCH_X86_32
+ mova m4, [o(pd_2048)]
+ REPX {paddd x, m4}, m1, m6
+ REPX {psrad x, 12}, m1, m6
+ mova [r3+1*16], m1
+%endif
+ pmulld m5, m2, [o(pd_3612)]
+ pmulld m2, [o(pd_1931)]
+%if ARCH_X86_32
+ pmulld m1, m3, [o(pd_m2598)]
+%else
+ pmulld m4, m3, [o(pd_m2598)]
+%endif
+ pmulld m3, [o(pd_3166)]
+ jmp .main_oddhalf_fast2
+.main_oddhalf:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t9
+ paddd m0, m4 ; t8
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
+ psubd m3, m1, m4 ; t10
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ psubd m2, m8, m6 ; t13
+ paddd m6, m8 ; t14
+ psubd m8, m7, m5 ; t12a
+ paddd m7, m5 ; t15a
+ REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pmulld x, m14}, m2, m8, m3, m4
+ paddd m2, m11
+ paddd m8, m11
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m8, m4 ; t11
+ paddd m4, m8 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova [r3+4*16], m4
+ mova [r3+5*16], m5
+ mova [r3+6*16], m6
+ mova [r3+7*16], m7
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m4, [o(pd_2048)]
+
+ ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
+
+ mova m2, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova m1, [r3+2*16]
+ mova m5, [r3+3*16]
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+
+ ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a
+
+ mova m0, [r3+0*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+.main_oddhalf_fast2:
+ REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3
+ REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3
+ psubd m4, m0, m1 ; t9
+ paddd m0, m1 ; t8
+ mova m1, [r3+1*16]
+ mova [r3+0*16], m4
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+ pmaxsd m3, [r3+0*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+ pminsd m3, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784
+ ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4
+ mova m0, [r3+0*16]
+ mova m2, [r3+1*16]
+ psubd m5, m1, m4 ; t10
+ mova [r3+1*16], m5
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ mova m5, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m2, m3, m6 ; t13
+ paddd m6, m3 ; t14
+ paddd m3, m7, m5 ; t15a
+ psubd m7, m5 ; t12a
+ mova [r3+0*16], m3
+ mova m3, [r3+1*16]
+ mova m5, [o(clip_18b_min)]
+ REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
+ pmaxsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(clip_18b_max)]
+ REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
+ pminsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(pd_2896)]
+ REPX {pmulld x, m5}, m2, m7, m3, m4
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m2, m7
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m7, m4 ; t11
+ paddd m4, m7 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova m7, [r3+0*16]
+ mova [r3+11*16], m0
+ mova [r3+10*16], m1
+ mova [r3+9*16], m2
+ mova [r3+8*16], m3
+ mova [r3+7*16], m4
+ mova [r3+6*16], m5
+ mova [r3+5*16], m6
+ mova [r3+4*16], m7
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ pcmpeqd m8, m8
+ REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova m8, [r3+1*16]
+ mova m9, [r3+2*16]
+ mova m10, [r3+3*16]
+ mova m11, [r3+4*16]
+ mova m12, [r3+5*16]
+ mova m13, [r3+6*16]
+ mova m14, [r3+7*16]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r3+0*16] ; out8
+ paddd m7, [r3+0*16] ; out7
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ; out0-15 are now in m0-15
+%else
+ mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ pcmpeqd m0, m0 ; -1
+ REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ mova [r3+ 1*16], m1
+ mova [r3+ 2*16], m2
+ mova m1, [r3+ 0*16]
+ psubd m1, m0
+ mova [r3+ 0*16], m1
+ mova m1, [r3+11*16]
+ mova m2, [r3+10*16]
+ psubd m0, m7, m1
+ paddd m7, m1
+ psubd m1, m6, m2
+ paddd m6, m2
+ REPX {psrad x, 1}, m0, m1, m6, m7
+ packssdw m0, m1 ; out8-9
+ packssdw m6, m7 ; out6-7
+ mova [r3+11*16], m6
+ mova m1, [r3+9*16]
+ mova m7, [r3+8*16]
+ psubd m2, m5, m1
+ paddd m5, m1
+ psubd m1, m4, m7
+ paddd m4, m7
+ REPX {psrad x, 1}, m2, m1, m4, m5
+ packssdw m2, m1 ; out10-11
+ packssdw m4, m5 ; out4-5
+ mova m1, [r3+2*16]
+ mova [r3+10*16], m4
+ mova m6, [r3+7*16]
+ mova m7, [r3+6*16]
+ psubd m4, m3, m6
+ paddd m3, m6
+ psubd m6, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 1}, m4, m6, m1, m3
+ packssdw m4, m6 ; out12-13
+ packssdw m1, m3 ; out2-3
+ mova m3, [r3+1*16]
+ mova [r3+9*16], m1
+ mova m1, [r3+0*16]
+ mova m5, [r3+5*16]
+ mova m7, [r3+4*16]
+ psubd m6, m3, m5
+ paddd m3, m5
+ psubd m5, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 1}, m6, m5, m1, m3
+ packssdw m6, m5 ; out14-15
+ packssdw m1, m3 ; out0-1
+ mova [r3+8*16], m1
+%endif
+ ret
+
+.pass2:
+ lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
+.pass2_loop:
+ lea r3, [strideq*3]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call r4
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+ REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%if ARCH_X86_64
+ mova m0, m8
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+%else
+ mova m0, [rsp+gprsize+0*16]
+ mova m1, [rsp+gprsize+1*16]
+ mova m2, [rsp+gprsize+2*16]
+ mova m3, [rsp+gprsize+3*16]
+%endif
+ add dstq, 16
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call r4
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+ RET
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ ; set up stack pointer
+ lea r3, [rsp+gprsize]
+ call .main
+%if ARCH_X86_64
+ jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ jmp tx2q
+%endif
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*16]
+ mova m1, [cq+13*16]
+ mova m2, [cq+ 6*16]
+ mova m3, [cq+ 9*16]
+ mova m4, [cq+10*16]
+ mova m5, [cq+ 5*16]
+ mova m6, [cq+14*16]
+ mova m7, [cq+ 1*16]
+ call .main_part1
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+15*16]
+ mova m2, [cq+ 4*16]
+ mova m3, [cq+11*16]
+ mova m4, [cq+ 8*16]
+ mova m5, [cq+ 7*16]
+ mova m6, [cq+12*16]
+ mova m7, [cq+ 3*16]
+ call .main_part2
+.round:
+%if ARCH_X86_64
+ mova m15, [o(pd_6144)]
+ psrld m14, 11 ; pd_1
+ pcmpeqd m8, m8 ; -1
+ psubd m13, m15, m14 ; pd_6143
+ REPX {paddd x, m14}, m0, m2
+ REPX {paddd x, m15}, m4, m6
+ REPX {pxor x, m8 }, m1, m3, m5, m7
+ REPX {psrad x, 1 }, m1, m3
+ REPX {paddd x, m15}, m5, m7
+ REPX {psubd x, m8 }, m1, m3
+ paddd m8, m15, m9
+ psubd m9, m13, m10
+ paddd m10, m15, m11
+ psubd m11, m13, m12
+ paddd m12, m14, [r3+3*16]
+ psubd m13, m14, [r3+2*16]
+ psubd m15, m14, [r3+0*16]
+ paddd m14, [r3+1*16]
+ REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15
+ REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+ mova [r3+8*16], m1
+ mova [r3+9*16], m3
+ mova m3, [o(pd_6144)]
+ pcmpeqd m1, m1
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m5, m6, m7
+ REPX {psrad x, 13}, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {pxor x, m1}, m5, m7
+ REPX {psubd x, m1}, m4, m6
+ REPX {psrad x, 1 }, m4, m5, m6, m7
+ REPX {psubd x, m1}, m5, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova m5, [r3+8*16]
+ mova m7, [r3+9*16]
+ mova [r3+8*16], m4
+ mova [r3+9*16], m6
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m0, m5, m2, m7
+ REPX {psrad x, 13}, m0, m5, m2, m7
+ packssdw m0, m5
+ packssdw m2, m7
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ REPX {psubd x, m1}, m4, m6
+ REPX {pxor x, m1}, m5, m7
+ REPX {psrad x, 1 }, m4, m5, m6, m7
+ REPX {psubd x, m1}, m5, m7
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+
+.main_part2:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380
+ psubd m8, m0, m4 ; t8a
+ paddd m0, m4 ; t0a
+ psubd m4, m1, m5 ; t9a
+ paddd m1, m5 ; t1a
+ psubd m5, m2, m6 ; t12a
+ paddd m2, m6 ; t4a
+ psubd m6, m3, m7 ; t13a
+ paddd m7, m3 ; t5a
+ REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ mova m15, [o(pd_4017)]
+ mova m10, [o(pd_799)]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+ psubd m3, m0, m2 ; t4
+ paddd m0, m2 ; t0
+ psubd m2, m1, m7 ; t5
+ paddd m1, m7 ; t1
+ psubd m7, m4, m6 ; t12a
+ paddd m4, m6 ; t8a
+ psubd m6, m8, m5 ; t13a
+ paddd m5, m8 ; t9a
+ REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15
+ mova m10, [r3+0*16] ; t2
+ mova m8, [r3+1*16] ; t3
+ psubd m9, m0, m10 ; t2a
+ paddd m0, m10 ; out0
+ psubd m10, m1, m8 ; t3a
+ paddd m1, m8 ; -out15
+ mova [r3+0*16], m1
+ mova m15, [r3+3*16] ; t7a
+ mova m1, [r3+2*16] ; t6a
+ psubd m8, m3, m15 ; t7
+ paddd m15, m3 ; out12
+ paddd m3, m2, m1 ; -out3
+ psubd m2, m1 ; t6
+ mova [r3+3*16], m15
+ mova [r3+1*16], m2
+ mova m1, [r3+7*16] ; t15
+ mova m2, [r3+6*16] ; t14
+ paddd m15, m7, m1 ; -out13
+ psubd m7, m1 ; t15a
+ psubd m11, m6, m2 ; t14a
+ paddd m2, m6 ; out2
+ mova [r3+2*16], m15
+ mova m1, [r3+4*16] ; t10a
+ mova m15, [r3+5*16] ; t11a
+ psubd m6, m4, m1 ; t10
+ paddd m1, m4 ; -out1
+ psubd m4, m5, m15 ; t11
+ paddd m5, m15 ; out14
+ REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
+ pmaxsd m12, [r3+1*16] ; t6
+ mova [r3+1*16], m5
+ REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
+ REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
+ paddd m5, m11, m7 ; -out5 (unshifted)
+ psubd m11, m7 ; out10 (unshifted)
+ paddd m7, m9, m10 ; -out7 (unshifted)
+ psubd m9, m10 ; out8 (unshifted)
+ psubd m10, m6, m4 ; -out9 (unshifted)
+ paddd m6, m4 ; out6 (unshifted)
+ paddd m4, m12, m8 ; out4 (unshifted)
+ psubd m12, m8 ; -out11 (unshifted)
+%else
+ mova [r3+8*16], m0
+ mova [r3+9*16], m1
+ mova [r3+10*16], m2
+ mova [r3+11*16], m3
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380
+ mova m0, [r3+8*16]
+ mova m1, [r3+9*16]
+ mova [r3+8*16], m4
+ mova m4, [r3+10*16]
+ mova [r3+9*16], m5
+ mova [r3+10*16], m6
+ mova m5, [r3+11*16]
+ mova [r3+11*16], m7
+ ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091
+ ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703
+ mova m2, [r3+8*16]
+ mova m6, [r3+9*16]
+ psubd m3, m0, m2 ; t8a
+ paddd m0, m2 ; t0a
+ mova [r3+8*16], m3
+ psubd m2, m1, m6 ; t9a
+ paddd m1, m6 ; t1a
+ mova m3, [r3+10*16]
+ psubd m6, m4, m3 ; t12a
+ paddd m4, m3 ; t4a
+ mova m3, [r3+11*16]
+ psubd m7, m5, m3 ; t13a
+ paddd m5, m3 ; t5a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+ pmaxsd m3, [r3+8*16]
+ mova [r3+8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+ pminsd m3, [r3+8*16]
+ mova [r3+8*16], m3
+ psubd m3, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m1, m5 ; t5
+ paddd m1, m5 ; t1
+ mova m5, [o(pd_2048)]
+ mova [r3+9*16], m1
+ mova [r3+10*16], m4
+ mova [r3+11*16], m3
+ mova m3, [r3+8*16]
+ mova [r3+8*16], m0
+ ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017
+ ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4
+ psubd m5, m2, m7 ; t12a
+ paddd m2, m7 ; t8a
+ psubd m7, m3, m6 ; t13a
+ paddd m6, m3 ; t9a
+ mova m0, [r3+8*16]
+ mova m1, [r3+9*16]
+ mova m4, [r3+10*16]
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+ pmaxsd m3, [r3+11*16]
+ mova [r3+8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+ pminsd m3, [r3+8*16]
+ mova [r3+8*16], m0
+ mova [r3+9*16], m1
+ mova [r3+10*16], m2
+ mova [r3+11*16], m6
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784
+ ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784
+ mova m0, [r3+7*16] ; t7a
+ mova m2, [r3+6*16] ; t6a
+ psubd m1, m3, m0 ; t7
+ paddd m0, m3 ; out12
+ paddd m3, m4, m2 ; -out3
+ psubd m4, m2 ; t6
+ mova [r3+7*16], m3
+ mova m3, [r3+3*16] ; t15
+ mova m2, [r3+2*16] ; t14
+ paddd m6, m5, m3 ; -out13
+ psubd m5, m3 ; t15a
+ psubd m3, m7, m2 ; t14a
+ paddd m2, m7 ; out2
+ mova [r3+6*16], m2
+ mova m7, [r3+0*16] ; t10a
+ mova m2, [r3+1*16] ; t11a
+ mova [r3+0*16], m0
+ mova [r3+1*16], m6
+ mova m6, [r3+11*16]
+ psubd m0, m6, m2 ; t11
+ paddd m6, m2 ; out14
+ mova [r3+2*16], m6
+ mova m2, [r3+10*16]
+ psubd m6, m2, m7 ; t10
+ paddd m2, m7 ; -out1
+ mova m7, [r3+5*16] ; t3
+ mova [r3+5*16], m2
+ mova [r3+10*16], m1
+ mova m1, [r3+9*16]
+ psubd m2, m1, m7 ; t3a
+ paddd m1, m7 ; -out15
+ mova [r3+3*16], m1
+ mova m1, [r3+4*16] ; t2
+ mova m7, [r3+8*16]
+ psubd m7, m1 ; t2a
+ paddd m1, [r3+8*16] ; out0
+ mova [r3+4*16], m1
+ mova m1, [o(clip_18b_min)]
+ REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pmaxsd m1, [r3+10*16]
+ mova [r3+10*16], m1
+ mova m1, [o(clip_18b_max)]
+ REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pminsd m1, [r3+10*16]
+ mova [r3+10*16], m1
+ mova m1, [o(pd_2896)]
+ REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pmulld m1, [r3+10*16]
+ mova [r3+11*16], m3
+ psubd m3, m4, m1 ; -out11 (unshifted)
+ paddd m4, m1 ; out4 (unshifted)
+ psubd m1, m6, m0 ; -out9 (unshifted)
+ paddd m6, m0 ; out6 (unshifted)
+ psubd m0, m7, m2 ; out8 (unshifted)
+ paddd m7, m2 ; -out7 (unshifted)
+ mova m2, [r3+11*16]
+ mova [r3+11*16], m5
+ paddd m5, m2 ; -out5 (unshifted)
+ psubd m2, [r3+11*16] ; out10 (unshifted)
+ ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
+ ; r[-4,3] contain out0-3 and out12-15
+%endif
+ ret
+.main_part1:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601
+ psubd m8, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m3, m7 ; t15a
+ paddd m7, m3 ; t7a
+ REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ mova m15, [o(pd_2276)]
+ mova m10, [o(pd_3406)]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m8, m5 ; t15a
+ paddd m5, m8 ; t11a
+ REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ mova m15, [o(pd_1567)]
+ mova m10, [o(pd_3784)]
+ ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+4*16], m4
+ mova [r3+5*16], m5
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova [r3+6*16], m6
+ mova [r3+7*16], m7
+%else
+ mova [r3+4*16], m0
+ mova [r3+5*16], m1
+ mova [r3+6*16], m2
+ mova [r3+7*16], m3
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601
+ mova [r3+0*16], m4
+ mova [r3+1*16], m5
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m0, [r3+4*16]
+ mova m1, [r3+5*16]
+ mova m2, [r3+6*16]
+ mova m7, [r3+7*16]
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973
+ ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ psubd m6, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ mova [r3+4*16], m6
+ mova m6, [r3+2*16]
+ mova m3, [r3+3*16]
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m7, m3 ; t15a
+ paddd m7, m3 ; t7a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+ pmaxsd m3, [r3+4*16]
+ mova [r3+4*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+ pminsd m3, [r3+4*16]
+ mova [r3+4*16], m3
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ mova [r3+5*16], m1
+ mova [r3+6*16], m3
+ mova [r3+7*16], m2
+ mova m1, [r3+4*16]
+ mova [r3+4*16], m0
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276
+ ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m1, m5 ; t15a
+ paddd m5, m1 ; t11a
+ mova m1, [r3+5*16]
+ mova m3, [r3+6*16]
+ mova m2, [r3+7*16]
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+ pmaxsd m0, [r3+4*16]
+ mova [r3+4*16], m0
+ mova m0, [o(clip_18b_max)]
+ REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+ pminsd m0, [r3+4*16]
+ mova [r3+4*16], m0
+ mova [r3+5*16], m1
+ mova [r3+0*16], m4
+ mova [r3+1*16], m5
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567
+ ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567
+ mova [r3+6*16], m2
+ mova [r3+7*16], m3
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+%endif
+ ret
+
+.pass2:
+ lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r3, [rsp+gprsize]
+ call m(iadst_16x4_internal_16bpc).main
+%if ARCH_X86_64
+ packssdw m1, m0
+ packssdw m3, m2
+ packssdw m5, m4
+ packssdw m7, m6
+ packssdw m9, m8
+ packssdw m11, m10
+ packssdw m13, m12
+ packssdw m15, m14
+ mova m0, m15
+ mova m2, m13
+ mova m4, m11
+ mova m6, m9
+ mova m8, m7
+ mova m10, m5
+ mova m12, m3
+ mova m14, m1
+ jmp m(idct_16x4_internal_16bpc).transpose
+%else
+ mova [rsp+gprsize+4*16], m0
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m4
+ mova [rsp+gprsize+7*16], m6
+ pshufd m6, [rsp+gprsize+ 8*16], q1032
+ pshufd m4, [rsp+gprsize+ 9*16], q1032
+ pshufd m2, [rsp+gprsize+10*16], q1032
+ pshufd m0, [rsp+gprsize+11*16], q1032
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ pshufd m6, [rsp+gprsize+ 4*16], q1032
+ pshufd m4, [rsp+gprsize+ 5*16], q1032
+ pshufd m2, [rsp+gprsize+ 6*16], q1032
+ pshufd m0, [rsp+gprsize+ 7*16], q1032
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ jmp tx2q
+%endif
+
+.pass2:
+ lea r3, [strideq*3]
+ lea dstq, [dstq+r3]
+ neg strideq
+ lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
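+ ; note: pd_11586 = 2*5793 ~= 2*sqrt(2) in .12 fixed point (the identity16 gain)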
+%if ARCH_X86_64
+ mova m15, [o(pd_11586)]
+ pmulld m0, m15, [cq+ 0*16]
+ pmulld m1, m15, [cq+ 1*16]
+ pmulld m2, m15, [cq+ 2*16]
+ pmulld m3, m15, [cq+ 3*16]
+ pmulld m4, m15, [cq+ 4*16]
+ pmulld m5, m15, [cq+ 5*16]
+ pmulld m6, m15, [cq+ 6*16]
+ pmulld m7, m15, [cq+ 7*16]
+ pmulld m8, m15, [cq+ 8*16]
+ pmulld m9, m15, [cq+ 9*16]
+ pmulld m10, m15, [cq+10*16]
+ pmulld m11, m15, [cq+11*16]
+ pmulld m12, m15, [cq+12*16]
+ pmulld m13, m15, [cq+13*16]
+ pmulld m14, m15, [cq+14*16]
+ pmulld m15, [cq+15*16]
+ mova [cq+ 0*16], m15
+ mova m15, [o(pd_6144)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [cq+ 0*16]
+ REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+ add cq, 8*16
+ mov r5d, 2
+.loop_pass1:
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+0*16]
+ pmulld m1, m7, [cq+1*16]
+ pmulld m2, m7, [cq+2*16]
+ pmulld m3, m7, [cq+3*16]
+ pmulld m4, m7, [cq+4*16]
+ pmulld m5, m7, [cq+5*16]
+ pmulld m6, m7, [cq+6*16]
+ pmulld m7, [cq+7*16]
+ mova [cq+7*16], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [cq+7*16]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ dec r5d
+ jz .end_pass1
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ sub cq, 8*16
+ jmp .loop_pass1
+.end_pass1:
+ jmp tx2q
+%endif
+
+.pass2:
+%if ARCH_X86_64
+ mova m12, [o(pw_1697x8)]
+%endif
+ lea r4, [o(.main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+.main:
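+ ; identity4 scaling: out = in + in*(sqrt(2)-1) ~= in*sqrt(2), with 1697/4096 ~= sqrt(2)-1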
+%if ARCH_X86_64
+ pmulhrsw m4, m0, m12
+ pmulhrsw m5, m1, m12
+ pmulhrsw m6, m2, m12
+ pmulhrsw m7, m3, m12
+%else
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m0, m7
+ pmulhrsw m5, m1, m7
+ pmulhrsw m6, m2, m7
+ pmulhrsw m7, m3
+%endif
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16
+%else
+ INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+%if ARCH_X86_32
+ add rsp, 1*16
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, 6
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6, 4, 6
+%else
+ mov [rsp+gprsize+12*16], r1
+ DECLARE_REG_TMP 1, 4, 3
+%endif
+ lea t0, [o(.main)]
+.loop_main:
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+
+ lea r3, [rsp+gprsize]
+.loop_pass1:
+ call t0
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+4*32+r5], m8
+ mova [cq+5*32+r5], m9
+ mova [cq+6*32+r5], m10
+ mova [cq+7*32+r5], m11
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+4*32+r5], m0
+ mova [cq+5*32+r5], m1
+ mova [cq+6*32+r5], m2
+ mova [cq+7*32+r5], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ pxor m7, m7
+ REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
+ test r5d, r5d
+ jz .end
+ mova [cq+0*32+r5], m0
+ mova [cq+1*32+r5], m1
+ mova [cq+2*32+r5], m2
+ mova [cq+3*32+r5], m3
+ xor r5d, r5d
+ jmp .loop_pass1
+.end:
+
+ jmp tx2q
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*32+r5]
+ mova m1, [cq+ 3*32+r5]
+ mova m2, [cq+ 5*32+r5]
+ mova m3, [cq+ 7*32+r5]
+ mova m4, [cq+ 9*32+r5]
+ mova m5, [cq+11*32+r5]
+ mova m6, [cq+13*32+r5]
+ mova m7, [cq+15*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+ 2*32+r5]
+ mova m2, [cq+ 4*32+r5]
+ mova m3, [cq+ 6*32+r5]
+ mova m4, [cq+ 8*32+r5]
+ mova m5, [cq+10*32+r5]
+ mova m6, [cq+12*32+r5]
+ mova m7, [cq+14*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+.pass2_main:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity, 6
+
+cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*32+r5]
+ mova m1, [cq+13*32+r5]
+ mova m2, [cq+ 6*32+r5]
+ mova m3, [cq+ 9*32+r5]
+ mova m4, [cq+10*32+r5]
+ mova m5, [cq+ 5*32+r5]
+ mova m6, [cq+14*32+r5]
+ mova m7, [cq+ 1*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(iadst_16x4_internal_16bpc).main_part1
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+15*32+r5]
+ mova m2, [cq+ 4*32+r5]
+ mova m3, [cq+11*32+r5]
+ mova m4, [cq+ 8*32+r5]
+ mova m5, [cq+ 7*32+r5]
+ mova m6, [cq+12*32+r5]
+ mova m7, [cq+ 3*32+r5]
+%if ARCH_X86_32
+ add r3, 8*16
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 8*16
+%endif
+ call m(iadst_16x4_internal_16bpc).main_part2
+ call m(iadst_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mova m11, [o(pw_m2048)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity, 6
+
+cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
+ call m(iadst_16x8_internal_16bpc).main
+%if ARCH_X86_64
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, m14, q1032
+ pshufd m2, m12, q1032
+ pshufd m4, m10, q1032
+ pshufd m6, m8, q1032
+ mova m14, m1
+ mova m12, m3
+ mova m10, m5
+ mova m8, m7
+%else
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, [r3+11*16], q1032
+ pshufd m2, [r3+10*16], q1032
+ pshufd m4, [r3+9*16], q1032
+ pshufd m6, [r3+8*16], q1032
+ mova [r3+8*16], m7
+ mova [r3+9*16], m5
+ mova [r3+10*16], m3
+ mova [r3+11*16], m1
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ lea dstq, [dstq+strideq*8]
+ neg strideq
+ add dstq, strideq
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], strideq
+%endif
+ jmp m(iadst_16x8_internal_16bpc).pass2
+
+INV_TXFM_16X8_FN identity, dct, -54
+INV_TXFM_16X8_FN identity, adst, -54
+INV_TXFM_16X8_FN identity, flipadst, -54
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
+%if ARCH_X86_64
+ mova m15, [o(pd_2896)]
+ pmulld m0, m15, [cq+ 0*32+r5]
+ pmulld m1, m15, [cq+ 1*32+r5]
+ pmulld m2, m15, [cq+ 2*32+r5]
+ pmulld m3, m15, [cq+ 3*32+r5]
+ pmulld m4, m15, [cq+ 4*32+r5]
+ pmulld m5, m15, [cq+ 5*32+r5]
+ pmulld m6, m15, [cq+ 6*32+r5]
+ pmulld m7, m15, [cq+ 7*32+r5]
+ pmulld m8, m15, [cq+ 8*32+r5]
+ pmulld m9, m15, [cq+ 9*32+r5]
+ pmulld m10, m15, [cq+10*32+r5]
+ pmulld m11, m15, [cq+11*32+r5]
+ pmulld m12, m15, [cq+12*32+r5]
+ pmulld m13, m15, [cq+13*32+r5]
+ pmulld m14, m15, [cq+14*32+r5]
+ pmulld m15, [cq+15*32+r5]
+ mova [r3], m15
+ mova m15, [o(pd_2048)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ mova [r3], m15
+ mova m15, [o(pd_11586)]
+ REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pmulld m15, [r3]
+ mova [r3], m15
+ mova m15, [o(pd_6144)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%else
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+ 1*32+r5]
+ mova m2, [cq+ 2*32+r5]
+ mova m3, [cq+ 3*32+r5]
+ mova m4, [cq+ 4*32+r5]
+ mova m5, [cq+ 5*32+r5]
+ mova m6, [cq+ 6*32+r5]
+ mova m7, [cq+ 7*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ mova [r3], m7
+ mova m7, [o(pd_11586)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m2
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m0, [cq+ 8*32+r5]
+ mova m1, [cq+ 9*32+r5]
+ mova m2, [cq+10*32+r5]
+ mova m3, [cq+11*32+r5]
+ mova m4, [cq+12*32+r5]
+ mova m5, [cq+13*32+r5]
+ mova m6, [cq+14*32+r5]
+ mova m7, [cq+15*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ mova [r3], m7
+ mova m7, [o(pd_11586)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+%if ARCH_X86_64
+ mova m8, [o(pw_4096)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_64
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+ mova [rsp+gprsize], m7
+ mova m7, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
+%else
+ INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (5+ARCH_X86_64*3+WIN64)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, v
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6, 7
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+%elif ARCH_X86_32
+ DECLARE_REG_TMP 1, 6
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+.pass1_full:
+%undef cmp
+ mov t1d, 4
+.zero_loop:
+ dec t1d
+ cmp eobb, byte [r5+t1]
+ jb .zero_loop
+ mov r5d, t1d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, [rsp+16*16+2*gprsize]
+%endif
+ ; set up stack pointer
+ lea r3, [rsp+gprsize]
+.loop_pass1:
+ call t0
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+4*64+r5], m8
+ mova [cq+5*64+r5], m9
+ mova [cq+6*64+r5], m10
+ mova [cq+7*64+r5], m11
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+4*64+r5], m0
+ mova [cq+5*64+r5], m1
+ mova [cq+6*64+r5], m2
+ mova [cq+7*64+r5], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+0*64+r5], m0
+ mova [cq+1*64+r5], m1
+ mova [cq+2*64+r5], m2
+ mova [cq+3*64+r5], m3
+ pxor m0, m0
+ REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 16
+ jge .loop_pass1
+
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r1, [rsp+16*16+1*gprsize]
+%endif
+ jmp tx2q
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mova m0, [cq+ 1*64+r5]
+ mova m1, [cq+ 3*64+r5]
+ mova m2, [cq+ 5*64+r5]
+ mova m3, [cq+ 7*64+r5]
+ mova m4, [cq+ 9*64+r5]
+ mova m5, [cq+11*64+r5]
+ mova m6, [cq+13*64+r5]
+ mova m7, [cq+15*64+r5]
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*64+r5]
+ mova m1, [cq+ 2*64+r5]
+ mova m2, [cq+ 4*64+r5]
+ mova m3, [cq+ 6*64+r5]
+ mova m4, [cq+ 8*64+r5]
+ mova m5, [cq+10*64+r5]
+ mova m6, [cq+12*64+r5]
+ mova m7, [cq+14*64+r5]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call .round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ psrld m8, m11, 10 ; 2
+ REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova m8, [r3+1*16]
+ mova m9, [r3+2*16]
+ mova m10, [r3+3*16]
+ mova m11, [r3+4*16]
+ mova m12, [r3+5*16]
+ mova m13, [r3+6*16]
+ mova m14, [r3+7*16]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r3+0*16] ; out8
+ paddd m7, [r3+0*16] ; out7
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ; out0-15 are now in m0-15
+%else
+ mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ mova m0, [o(pd_2)]
+ REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ paddd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ mova [r3+ 1*16], m1
+ mova [r3+ 2*16], m2
+ mova m1, [r3+11*16]
+ mova m2, [r3+10*16]
+ psubd m0, m7, m1
+ paddd m7, m1
+ psubd m1, m6, m2
+ paddd m6, m2
+ REPX {psrad x, 2}, m0, m1, m6, m7
+ packssdw m0, m1 ; out8-9
+ packssdw m6, m7 ; out6-7
+ mova [r3+11*16], m6
+ mova m1, [r3+9*16]
+ mova m7, [r3+8*16]
+ psubd m2, m5, m1
+ paddd m5, m1
+ psubd m1, m4, m7
+ paddd m4, m7
+ REPX {psrad x, 2}, m2, m1, m4, m5
+ packssdw m2, m1 ; out10-11
+ packssdw m4, m5 ; out4-5
+ mova m1, [r3+2*16]
+ mova [r3+10*16], m4
+ mova m6, [r3+7*16]
+ mova m7, [r3+6*16]
+ psubd m4, m3, m6
+ paddd m3, m6
+ psubd m6, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 2}, m4, m6, m1, m3
+ packssdw m4, m6 ; out12-13
+ packssdw m1, m3 ; out2-3
+ mova m3, [r3+1*16]
+ mova [r3+9*16], m1
+ mova m1, [r3+0*16]
+ mova m5, [r3+5*16]
+ mova m7, [r3+4*16]
+ psubd m6, m3, m5
+ paddd m3, m5
+ psubd m5, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 2}, m6, m5, m1, m3
+ packssdw m6, m5 ; out14-15
+ packssdw m1, m3 ; out0-1
+ mova [r3+8*16], m1
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [cq+0*64+ 0]
+ mova m1, [cq+2*64+ 0]
+ mova m2, [cq+0*64+16]
+ mova m3, [cq+2*64+16]
+ mova m4, [cq+0*64+32]
+ mova m5, [cq+2*64+32]
+ mova m6, [cq+0*64+48]
+ mova m7, [cq+2*64+48]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ ; m7 is already stored in [rsp+gprsize+0*16]
+ mova m0, [cq+1*64+ 0]
+ mova m1, [cq+3*64+ 0]
+ mova m2, [cq+1*64+16]
+ mova m3, [cq+3*64+16]
+ mova m4, [cq+1*64+32]
+ mova m5, [cq+3*64+32]
+ mova m6, [cq+1*64+48]
+ mova m7, [cq+3*64+48]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+ ; out0-7 are in rsp+gprsize+3-10*mmsize
+ ; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ lea dstq, [r7+strideq*8]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ lea dstq, [dstq+strideq*8]
+%endif
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+ mov dstq, r7
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+%endif
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+ add r7, 16
+%define mzero m9
+%else
+ add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 64*4
+ REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+ dec r4d
+ jg .loop_pass2
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*64+r5]
+ mova m1, [cq+13*64+r5]
+ mova m2, [cq+ 6*64+r5]
+ mova m3, [cq+ 9*64+r5]
+ mova m4, [cq+10*64+r5]
+ mova m5, [cq+ 5*64+r5]
+ mova m6, [cq+14*64+r5]
+ mova m7, [cq+ 1*64+r5]
+ call m(iadst_16x4_internal_16bpc).main_part1
+ mova m0, [cq+ 0*64+r5]
+ mova m1, [cq+15*64+r5]
+ mova m2, [cq+ 4*64+r5]
+ mova m3, [cq+11*64+r5]
+ mova m4, [cq+ 8*64+r5]
+ mova m5, [cq+ 7*64+r5]
+ mova m6, [cq+12*64+r5]
+ mova m7, [cq+ 3*64+r5]
+ call m(iadst_16x4_internal_16bpc).main_part2
+ call .round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ pcmpeqd m8, m8 ; -1
+ mova m15, [o(pd_10240)]
+ psrld m14, 10 ; +2
+ psubd m13, m14, m8 ; +3
+ REPX {pxor x, m8 }, m1, m3, m5, m7
+ REPX {paddd x, m14}, m0, m2
+ REPX {paddd x, m13}, m1, m3
+ REPX {paddd x, m15}, m4, m5, m6, m7
+ paddd m13, m15, m8 ; +10239
+ paddd m8, m15, m9
+ psubd m9, m13, m10
+ paddd m10, m15, m11
+ psubd m11, m13, m12
+ paddd m12, m14, [r3+3*16]
+ psubd m13, m14, [r3+2*16]
+ psubd m15, m14, [r3+0*16]
+ paddd m14, [r3+1*16]
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+ mova [r3+8*16], m1
+ mova [r3+9*16], m3
+ mova m3, [o(pd_10240)]
+ pcmpeqd m1, m1
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m5, m6, m7
+ REPX {psrad x, 14}, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ mova m3, [o(pd_2)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m6
+ psubd m3, m1
+ REPX {paddd x, m3}, m5, m7
+ REPX {psrad x, 2 }, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova m5, [r3+8*16]
+ mova m7, [r3+9*16]
+ mova [r3+8*16], m4
+ mova [r3+9*16], m6
+ mova m3, [o(pd_10240)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m0, m5, m2, m7
+ REPX {psrad x, 14}, m0, m5, m2, m7
+ packssdw m0, m5
+ packssdw m2, m7
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ mova m3, [o(pd_2)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m6
+ psubd m3, m1
+ REPX {paddd x, m3}, m5, m7
+ REPX {psrad x, 2 }, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ mova m11, [o(pw_m2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [cq+0*64+32]
+ mova m1, [cq+1*64+32]
+ mova m2, [cq+2*64+16]
+ mova m3, [cq+3*64+16]
+ mova m4, [cq+0*64+ 0]
+ mova m5, [cq+1*64+ 0]
+ mova m6, [cq+2*64+48]
+ mova m7, [cq+3*64+48]
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m0, [cq+2*64+ 0]
+ mova m1, [cq+3*64+ 0]
+ mova m2, [cq+0*64+16]
+ mova m3, [cq+1*64+16]
+ mova m4, [cq+2*64+32]
+ mova m5, [cq+3*64+32]
+ mova m6, [cq+0*64+48]
+ mova m7, [cq+1*64+48]
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
+
+ ; out0-7 are in rsp+gprsize+3-10*mmsize
+ ; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ lea dstq, [r7+strideq*8]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ lea dstq, [dstq+strideq*8]
+%endif
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+ mov dstq, r7
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+%endif
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+ call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+ add r7, 16
+%define mzero m9
+%else
+ add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 64*4
+ REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+ dec r4d
+ jg .loop_pass2
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+ call m(iadst_16x16_internal_16bpc).main
+%if ARCH_X86_64
+ mova m1, m0
+ mova m3, m2
+ mova m5, m4
+ mova m7, m6
+ pshufd m0, m14, q1032
+ pshufd m2, m12, q1032
+ pshufd m4, m10, q1032
+ pshufd m6, m8, q1032
+ pshufd m8, m7, q1032
+ pshufd m10, m5, q1032
+ pshufd m12, m3, q1032
+ pshufd m14, m1, q1032
+%else
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, [r3+11*16], q1032
+ pshufd m2, [r3+10*16], q1032
+ pshufd m4, [r3+9*16], q1032
+ pshufd m6, [r3+8*16], q1032
+ mova [r3+11*16], m1
+ mova [r3+10*16], m3
+ mova [r3+ 9*16], m5
+ mova [r3+ 8*16], m7
+%endif
+ ret
+
+.pass2:
+ lea r3, [strideq*3]
+ lea r3, [r3*5]
+ add dstq, r3
+ neg strideq
+ jmp m(iadst_16x16_internal_16bpc).pass2
+
+INV_TXFM_16X16_FN identity, dct, h
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+ mova m15, [o(pd_11586)]
+ pmulld m0, m15, [cq+ 0*64+r5]
+ pmulld m1, m15, [cq+ 1*64+r5]
+ pmulld m2, m15, [cq+ 2*64+r5]
+ pmulld m3, m15, [cq+ 3*64+r5]
+ pmulld m4, m15, [cq+ 4*64+r5]
+ pmulld m5, m15, [cq+ 5*64+r5]
+ pmulld m6, m15, [cq+ 6*64+r5]
+ pmulld m7, m15, [cq+ 7*64+r5]
+ pmulld m8, m15, [cq+ 8*64+r5]
+ pmulld m9, m15, [cq+ 9*64+r5]
+ pmulld m10, m15, [cq+10*64+r5]
+ pmulld m11, m15, [cq+11*64+r5]
+ pmulld m12, m15, [cq+12*64+r5]
+ pmulld m13, m15, [cq+13*64+r5]
+ pmulld m14, m15, [cq+14*64+r5]
+ pmulld m15, [cq+15*64+r5]
+ mova [r3], m15
+ mova m15, [o(pd_10240)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%else
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+ 0*64+r5]
+ pmulld m1, m7, [cq+ 1*64+r5]
+ pmulld m2, m7, [cq+ 2*64+r5]
+ pmulld m3, m7, [cq+ 3*64+r5]
+ pmulld m4, m7, [cq+ 4*64+r5]
+ pmulld m5, m7, [cq+ 5*64+r5]
+ pmulld m6, m7, [cq+ 6*64+r5]
+ pmulld m7, [cq+ 7*64+r5]
+ mova [r3], m7
+ mova m7, [o(pd_10240)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+8*16], m0
+ mova [r3+9*16], m2
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+ 8*64+r5]
+ pmulld m1, m7, [cq+ 9*64+r5]
+ pmulld m2, m7, [cq+10*64+r5]
+ pmulld m3, m7, [cq+11*64+r5]
+ pmulld m4, m7, [cq+12*64+r5]
+ pmulld m5, m7, [cq+13*64+r5]
+ pmulld m6, m7, [cq+14*64+r5]
+ pmulld m7, [cq+15*64+r5]
+ mova [r3], m7
+ mova m7, [o(pd_10240)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_64
+ mova m4, [o(pw_2048)]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mova m7, [o(pw_1697x16)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ mov r5d, 4
+ lea r3, [strideq*3]
+.pass2_loop:
+ mova m0, [cq+0*64+0]
+ mova m1, [cq+1*64+0]
+ mova m2, [cq+2*64+0]
+ mova m3, [cq+3*64+0]
+ call m(iidentity_8x16_internal_16bpc).main
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).round1_and_write_8x4
+%else
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+%endif
+ REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ dec r5w
+ jg .pass2_loop
+ add cq, 64*3
+ btc r5d, 16
+ jc .end
+%if ARCH_X86_64
+ lea dstq, [r7+16]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ add dstq, 16
+%endif
+ add r5d, 4
+ jmp .pass2_loop
+.end:
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_5)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mov r5d, eobd
+ add eobb, 21
+ cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192
+ lea r4, [strideq*3]
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {paddsw x, m5}, m0, m1, m2, m3
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ call .main_zero
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ btc eobd, 16
+ jnc .loop
+ sub eobd, 64
+ jge .loop
+ RET
+ALIGN function_align
+.main_zero:
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main:
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m4, m2, m1
+ punpcklwd m2, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r4 ]
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r4 ], m3
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_4096)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mov r4d, eobd
+ add eobb, 21
+ cmovc eobd, r4d
+ lea r4, [strideq*3]
+ mov r5, dstq
+.loop:
+ mova m0, [cq+32*0]
+ packssdw m0, [cq+32*1]
+ mova m1, [cq+32*2]
+ packssdw m1, [cq+32*3]
+ mova m2, [cq+32*4]
+ packssdw m2, [cq+32*5]
+ mova m3, [cq+32*6]
+ packssdw m3, [cq+32*7]
+ REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .loop
+ add cq, 32*8-32
+ add r5, 16
+ mov dstq, r5
+ sub eobd, 64
+ jge .loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%else
+ mova m8, [o(pw_2896x8)]
+ mova m9, [o(pw_1697x16)]
+ mova m11, [o(pw_8192)]
+%endif
+ mova m7, [o(pixel_10bpc_max)]
+ lea r4, [strideq*3]
+ pxor m6, m6
+%if ARCH_X86_64
+ paddw m10, m11, m11 ; pw_16384
+%endif
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ pmulhrsw m4, m9, m0
+ pmulhrsw m5, m9, m1
+ REPX {pmulhrsw x, m10}, m4, m5
+%else
+ mova m6, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ mova m5, [o(pw_1697x16)]
+ pmulhrsw m4, m5, m0
+ pmulhrsw m5, m1
+ mova m6, [o(pw_16384)]
+ REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+ paddsw m0, m4
+ paddsw m1, m5
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m2
+ pmulhrsw m5, m9, m3
+ REPX {pmulhrsw x, m10}, m4, m5
+%else
+ mova m5, [o(pw_1697x16)]
+ pmulhrsw m4, m5, m2
+ pmulhrsw m5, m3
+ REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+ paddsw m2, m4
+ paddsw m3, m5
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+%else
+ psrlw m6, 1 ; pw_8192
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ pxor m6, m6
+%endif
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%else
+ mova m8, [o(pw_2896x8)]
+ mova m9, [o(pw_1697x16)]
+ mova m10, [o(pw_2048)]
+%endif
+ mova m7, [o(pixel_10bpc_max)]
+ lea r4, [strideq*3]
+ pxor m6, m6
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*1]
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*2]
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*3]
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1]
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3]
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5]
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7]
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+%else
+ mova m6, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+%endif
+ REPX {paddsw x, x }, m0, m1, m2, m3
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m0
+ pmulhrsw m5, m9, m1
+%else
+ mova m6, [o(pw_1697x16)]
+ pmulhrsw m4, m6, m0
+ pmulhrsw m5, m6, m1
+%endif
+ REPX {paddsw x, x }, m0, m1
+ paddsw m0, m4
+ paddsw m1, m5
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m2
+ pmulhrsw m5, m9, m3
+%else
+ pmulhrsw m4, m6, m2
+ pmulhrsw m6, m3
+%endif
+ REPX {paddsw x, x }, m2, m3
+ paddsw m2, m4
+%if ARCH_X86_64
+ paddsw m3, m5
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3
+%else
+ paddsw m3, m6
+ mova m6, [o(pw_2048)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ pxor m6, m6
+%endif
+ REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%undef cmp
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_8192)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ lea r4, [strideq*3]
+ mov r5, dstq
+ call .main ; 0
+ cmp eobd, 36
+ jl .ret
+ add cq, 128*8-32 ; 0 1
+ lea dstq, [r5+16] ; 1
+ call .main
+ call .main2
+ cmp eobd, 136
+ jl .ret
+ add cq, 128*16-64 ; 0 1 2
+ lea dstq, [r5+16*2] ; 1 2
+ call .main ; 2
+ call .main2
+ call .main2
+ cmp eobd, 300
+ jl .ret
+ add cq, 128*24-96 ; 0 1 2 3
+ add r5, 16*3 ; 1 2 3
+ mov dstq, r5 ; 2 3
+ call .main ; 3
+ call .main2
+ call .main2
+ call .main2
+ cmp eobd, 535
+ jl .ret
+ add cq, 128*24-96 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ mov r5, dstq ; 2 3 4
+ call .main ; 3 4
+ call .main2
+ call .main2
+ cmp eobd, 755
+ jl .ret
+ add cq, 128*16-64 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ mov r5, dstq ; 2 3 4 5
+ call .main ; 3 4 5
+ call .main2
+ cmp eobd, 911
+ jl .ret
+ add cq, 128*8-32 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ call .main ; 2 3 4 5
+.ret: ; 3 4 5 6
+ RET
+ALIGN function_align
+.main2:
+ sub cq, 128*8
+ sub dstq, 16
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
+ dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%define base $$
+ DECLARE_REG_TMP 0, 4
+%else
+ lea r6, [tbl_Nx32_odd_offset]
+%define base tbl_Nx32_odd_offset
+ DECLARE_REG_TMP 4, 7
+%if WIN64
+ mov [rsp+gprsize*1+35*16], r7
+%endif
+%endif
+%define o2(x) r6-base+x
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ mov [rsp+gprsize*1+35*16], r0
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_8x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
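+ ; each word in tbl_Nx32_odd_offset packs two byte offsets: low byte -> t1, high byte -> t0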
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+ 3*16+r5*8], m0
+ mova [rsp+11*16+r5*8], m0
+ mova [rsp+ 3*16+t0*8], m0
+ mova [rsp+ 3*16+t1*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_8x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+35*16], eobd
+ mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+0*128+r5*8]
+ mova m1, [cq+1*128+r5*8]
+ mova m2, [cq+2*128+r5*8]
+ mova m3, [cq+3*128+r5*8]
+ mova m4, [cq+4*128+r5*8]
+ mova m5, [cq+5*128+r5*8]
+ mova m6, [cq+6*128+r5*8]
+ mova m7, [cq+7*128+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ mova m1, [o(pd_2)]
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ call m(idct_8x4_internal_16bpc).round
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [r3+ 3*16+r5*8], m0
+ mova [r3+11*16+r5*8], m2
+ mova [r3+ 3*16+t1*8], m1
+ mova [r3+ 3*16+t0*8], m3
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass 2 code starts here
+ ; m0 is already loaded from the last iteration of the first pass
+%if ARCH_X86_32
+ mov r0, [rsp+gprsize*1+35*16]
+%endif
+ mov eobd, [rsp+gprsize*0+35*16]
+ cmp eobd, 43
+ jl .load_veryfast
+ cmp eobd, 107
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+ call .pass2
+%if WIN64
+ mov r7, [rsp+gprsize*1+35*16]
+%endif
+ RET
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m1, [rsp+gprsize+16* 4]
+ mova m2, [rsp+gprsize+16* 5]
+ mova m3, [rsp+gprsize+16* 6]
+ mova m4, [rsp+gprsize+16* 7]
+ mova m5, [rsp+gprsize+16* 8]
+ mova m6, [rsp+gprsize+16* 9]
+ mova m7, [rsp+gprsize+16*10]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 4*16], m1
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 6*16], m3
+ mova [rsp+gprsize+ 7*16], m4
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova m0, [rsp+gprsize+11*16]
+ mova m1, [rsp+gprsize+12*16]
+ mova m2, [rsp+gprsize+13*16]
+ mova m3, [rsp+gprsize+14*16]
+ mova m4, [rsp+gprsize+15*16]
+ mova m5, [rsp+gprsize+16*16]
+ mova m6, [rsp+gprsize+17*16]
+ mova m7, [rsp+gprsize+18*16]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+ mova m7, [rsp+gprsize+ 0*16]
+ mova [rsp+gprsize+11*16], m0
+ mova [rsp+gprsize+12*16], m1
+ mova [rsp+gprsize+13*16], m2
+ mova [rsp+gprsize+14*16], m3
+ mova [rsp+gprsize+15*16], m4
+ mova [rsp+gprsize+16*16], m5
+ mova [rsp+gprsize+17*16], m6
+ mova [rsp+gprsize+18*16], m7
+ call r4
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+11*16]
+ mova m1, [rsp+gprsize+12*16]
+ mova m2, [rsp+gprsize+13*16]
+ mova m3, [rsp+gprsize+14*16]
+ mova m4, [rsp+gprsize+15*16]
+ mova m5, [rsp+gprsize+16*16]
+ mova m6, [rsp+gprsize+17*16]
+ mova m7, [rsp+gprsize+18*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+19*16]
+ mova m1, [rsp+gprsize+20*16]
+ mova m2, [rsp+gprsize+21*16]
+ mova m3, [rsp+gprsize+22*16]
+ mova m4, [rsp+gprsize+23*16]
+ mova m5, [rsp+gprsize+24*16]
+ mova m6, [rsp+gprsize+25*16]
+ mova m7, [rsp+gprsize+26*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+27*16]
+ mova m1, [rsp+gprsize+28*16]
+ mova m2, [rsp+gprsize+29*16]
+ mova m3, [rsp+gprsize+30*16]
+ mova m4, [rsp+gprsize+31*16]
+ mova m5, [rsp+gprsize+32*16]
+ mova m6, [rsp+gprsize+33*16]
+ mova m7, [rsp+gprsize+34*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ ret
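+ ; dc-only fast path: only coeff[0] is nonzero; scale it and jump to the
+ ; shared 8x8 tail, which adds the resulting constant to every output pixel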
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (31+2*ARCH_X86_64)*16
+ jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
+
+cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ mov [rsp+gprsize*1+76*16], r0
+%elif WIN64
+ mov [rsp+gprsize*1+76*16], r7
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m0
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m0
+ mova [rsp+44*16+r5*8], m0
+ mova [rsp+52*16+r5*8], m0
+ mova [rsp+44*16+t0*8], m0
+ mova [rsp+44*16+t1*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+76*16], eobd
+ mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*128+r5*8]
+ mova m1, [cq+ 3*128+r5*8]
+ mova m2, [cq+ 5*128+r5*8]
+ mova m3, [cq+ 7*128+r5*8]
+ mova m4, [cq+ 9*128+r5*8]
+ mova m5, [cq+11*128+r5*8]
+ mova m6, [cq+13*128+r5*8]
+ mova m7, [cq+15*128+r5*8]
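+ ; 16x32 is a 2:1 rectangular transform, so rect2_mul pre-scales the
+ ; coefficients by 2896/4096 (~1/sqrt(2))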
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*128+r5*8]
+ mova m1, [cq+ 2*128+r5*8]
+ mova m2, [cq+ 4*128+r5*8]
+ mova m3, [cq+ 6*128+r5*8]
+ mova m4, [cq+ 8*128+r5*8]
+ mova m5, [cq+10*128+r5*8]
+ mova m6, [cq+12*128+r5*8]
+ mova m7, [cq+14*128+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+%if ARCH_X86_64
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m2
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t0*8], m3
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+44*16+r5*8], m8
+ mova [rsp+52*16+r5*8], m10
+ mova [rsp+44*16+t1*8], m9
+ mova [rsp+44*16+t0*8], m11
+%else
+ mova [rsp+44*16+r5*8], m0
+ mova [rsp+52*16+r5*8], m2
+ mova [rsp+44*16+t1*8], m1
+ mova [rsp+44*16+t0*8], m3
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m4, [r3+10*16]
+ mova m6, [r3+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m2
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t0*8], m3
+%endif
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
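+ ; the second pass reuses inv_txfm_add_dct_dct_8x32_16bpc.pass2 on two
+ ; 8-pixel-wide halves of the block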
+ add rsp, 9*16
+%if ARCH_X86_64
+ mov r6, dstq
+%else
+ mov dstq, [rsp+gprsize*1+67*16]
+%endif
+ mov eobd, [rsp+gprsize*0+67*16]
+ cmp eobd, 44
+ jl .load_veryfast
+ cmp eobd, 151
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+32]
+ mov r7, -4
+%else
+ lea r2, [rsp+67*16]
+ mov dword [r2+0*gprsize], 2
+%endif
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [rsp+16* 3]
+.loop_pass2_entry:
+%if ARCH_X86_32
+ mov dstq, [r2+1*gprsize]
+%endif
+ call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
+ add rsp, 32*16
+%if ARCH_X86_64
+ add r7, 2
+ lea dstq, [r2+r7*8]
+ jl .loop_pass2
+%if WIN64
+ mov r7, [rsp+gprsize*1+3*16]
+%endif
+%else
+ add dword [r2+1*gprsize], 16
+ dec dword [r2+0*gprsize]
+ jg .loop_pass2
+%endif
+%assign stack_size (stack_size-73*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-73*16)
+%assign stack_offset (stack_offset-73*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+ RET
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add rsp, (65+4*ARCH_X86_64)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+
+cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ add r5d, r5d
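+ ; r5d is now 2 if eob >= 10 (both first-pass iterations are run),
+ ; otherwise 0 (only the first iteration)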
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+ mova m0, [cq+32* 1+r5*8]
+ mova m1, [cq+32* 7+r5*8]
+ mova m2, [cq+32* 9+r5*8]
+ mova m3, [cq+32*15+r5*8]
+ mova m4, [cq+32*17+r5*8]
+ mova m5, [cq+32*23+r5*8]
+ mova m6, [cq+32*25+r5*8]
+ mova m7, [cq+32*31+r5*8]
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mov r3, rsp
+ call .main_oddhalf_part1
+ mova m0, [cq+32* 3+r5*8]
+ mova m1, [cq+32* 5+r5*8]
+ mova m2, [cq+32*11+r5*8]
+ mova m3, [cq+32*13+r5*8]
+ mova m4, [cq+32*19+r5*8]
+ mova m5, [cq+32*21+r5*8]
+ mova m6, [cq+32*27+r5*8]
+ mova m7, [cq+32*29+r5*8]
+ call .main_oddhalf_part2
+ mova m0, [cq+32* 2+r5*8]
+ mova m1, [cq+32* 6+r5*8]
+ mova m2, [cq+32*10+r5*8]
+ mova m3, [cq+32*14+r5*8]
+ mova m4, [cq+32*18+r5*8]
+ mova m5, [cq+32*22+r5*8]
+ mova m6, [cq+32*26+r5*8]
+ mova m7, [cq+32*30+r5*8]
+ add r3, 16*(16+4*ARCH_X86_32)
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+ mova m0, [cq+32* 0+r5*8]
+ mova m1, [cq+32* 4+r5*8]
+ mova m2, [cq+32* 8+r5*8]
+ mova m3, [cq+32*12+r5*8]
+ mova m4, [cq+32*16+r5*8]
+ mova m5, [cq+32*20+r5*8]
+ mova m6, [cq+32*24+r5*8]
+ mova m7, [cq+32*28+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call .round_dct32
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32* 8+r5*8], m8
+ mova [cq+32* 9+r5*8], m9
+ mova [cq+32*10+r5*8], m10
+ mova [cq+32*11+r5*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32* 4+r5*8], m8
+ mova [cq+32* 5+r5*8], m9
+ mova [cq+32* 6+r5*8], m10
+ mova [cq+32* 7+r5*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32*12+r5*8], m8
+ mova [cq+32*13+r5*8], m9
+ mova [cq+32*14+r5*8], m10
+ mova [cq+32*15+r5*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32* 4+r5*8], m0
+ mova [cq+32* 5+r5*8], m1
+ mova [cq+32* 6+r5*8], m2
+ mova [cq+32* 7+r5*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32* 8+r5*8], m0
+ mova [cq+32* 9+r5*8], m1
+ mova [cq+32*10+r5*8], m2
+ mova [cq+32*11+r5*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32*12+r5*8], m0
+ mova [cq+32*13+r5*8], m1
+ mova [cq+32*14+r5*8], m2
+ mova [cq+32*15+r5*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ pxor m7, m7
+ ; clear lower half of [cq]
+ REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32* 0+r5*8], m0
+ mova [cq+32* 1+r5*8], m1
+ mova [cq+32* 2+r5*8], m2
+ mova [cq+32* 3+r5*8], m3
+ sub r5d, 2
+ jmp .loop_pass1
+.end_pass1:
+
+ ; pass=2: this has to be a call, otherwise the stack pointer would have
+ ; the wrong offset in the 8-bit code
+ mov r4d, 4
+ call m(idct_16x8_internal_16bpc).pass2_main
+ RET
+
+.main_oddhalf_part1_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_4091)]
+ pmulld m0, [o(pd_201)]
+ pmulld m4, m3, [o(pd_m2751)]
+%if ARCH_X86_32
+ pmulld m3, [o(pd_3035)]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m7
+ REPX {psrad x, 12}, m0, m7
+ mova [r3+3*16], m7
+ mova m7, m3
+ mova m3, m5
+%else
+ pmulld m3, [o(pd_3035)]
+%endif
+ pmulld m6, m1, [o(pd_m1380)]
+ pmulld m1, [o(pd_3857)]
+ pmulld m5, m2, [o(pd_3703)]
+ pmulld m2, [o(pd_1751)]
+ jmp .main_oddhalf_part1_fast2
+.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
+%if ARCH_X86_64
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t17
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m7, m3 ; t30
+ paddd m7, m3 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ mova m15, [o(pd_4017)]
+ mova m10, [o(pd_799)]
+ ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m1 ; t28a
+ paddd m7, m1 ; t31a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m8, m2 ; t29
+ paddd m8, m2 ; t30
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
+ mova [r3+16*0], m0
+ mova [r3+16*1], m5
+ mova [r3+16*2], m4
+ mova [r3+16*3], m6
+ mova [r3+16*4], m3
+ mova [r3+16*5], m1
+ mova [r3+16*6], m8
+ mova [r3+16*7], m7
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a
+ mova m4, [r3+2*16]
+ mova m5, [r3+3*16]
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m2, [r3+0*16]
+ mova m7, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m6, [r3+2*16]
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7
+ REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7
+ psubd m3, m0, m4 ; t17
+ mova [r3+0*16], m3
+ mova m3, [r3+3*16]
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m3, m7 ; t30
+ paddd m7, m3 ; t31
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pmaxsd m3, [r3+0*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pminsd m3, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m3, m2 ; t29
+ paddd m3, m2 ; t30
+ mova m0, [r3+0*16]
+ mova m2, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ mova [r3+0*16], m3
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m2 ; t28a
+ paddd m7, m2 ; t31a
+ mova m2, [o(clip_18b_min)]
+ REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pmaxsd m2, [r3+0*16]
+ mova [r3+0*16], m2
+ mova m2, [o(clip_18b_max)]
+ REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pminsd m2, [r3+0*16]
+ mova [r3+16*0], m0
+ mova [r3+16*1], m5
+ mova [r3+16*6], m2
+ mova [r3+16*7], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28
+ mova [r3+16*2], m4
+ mova [r3+16*3], m6
+ mova [r3+16*4], m3
+ mova [r3+16*5], m1
+%endif
+ ret
+.main_oddhalf_part2_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_m601)]
+ pmulld m0, [o(pd_4052)]
+ pmulld m4, m3, [o(pd_3290)]
+%if ARCH_X86_32
+ pmulld m3, [o(pd_2440)]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m7
+ REPX {psrad x, 12}, m0, m7
+ mova [r3+11*16], m7
+ mova m7, m3
+ mova m3, m5
+%else
+ pmulld m3, [o(pd_2440)]
+%endif
+ pmulld m6, m1, [o(pd_3973)]
+ pmulld m1, [o(pd_995)]
+ pmulld m5, m2, [o(pd_m2106)]
+ pmulld m2, [o(pd_3513)]
+ jmp .main_oddhalf_part2_fast2
+.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
+%if ARCH_X86_64
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t25
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m7, m3 ; t22
+ paddd m7, m3 ; t23
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ mova m15, [o(pd_2276)]
+ mova m10, [o(pd_3406)]
+ ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
+ ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m1 ; t20a
+ paddd m7, m1 ; t23a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m8, m2 ; t26
+ paddd m8, m2 ; t25
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
+ mova m9, [r3+16*0] ; t16a
+ mova m10, [r3+16*1] ; t17
+ psubd m2, m9, m7 ; t23
+ paddd m9, m7 ; t16
+ psubd m7, m10, m5 ; t22a
+ paddd m10, m5 ; t17a
+ REPX {pmaxsd x, m12}, m9, m10, m2, m7
+ REPX {pminsd x, m13}, m9, m10, m2, m7
+ mova [r3+16*0], m9
+ mova [r3+16*1], m10
+ mova m9, [r3+16*2] ; t18a
+ mova m10, [r3+16*3] ; t19
+ psubd m5, m9, m1 ; t21
+ paddd m9, m1 ; t18
+ psubd m1, m10, m6 ; t20a
+ paddd m10, m6 ; t19a
+ REPX {pmaxsd x, m12}, m9, m10, m5, m1
+ REPX {pminsd x, m13}, m9, m10, m5, m1
+ mova [r3+16*2], m9
+ mova [r3+16*3], m10
+ mova m9, [r3+16*4] ; t28
+ mova m10, [r3+16*5] ; t29a
+ psubd m6, m9, m3 ; t27a
+ paddd m9, m3 ; t28a
+ psubd m3, m10, m4 ; t26
+ paddd m10, m4 ; t29
+ REPX {pmaxsd x, m12}, m9, m10, m6, m3
+ REPX {pminsd x, m13}, m9, m10, m6, m3
+ REPX {pmulld x, m14}, m6, m3, m1, m5
+ paddd m6, m11
+ paddd m3, m11
+ psubd m4, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m5 ; t21a
+ paddd m3, m5 ; t26a
+ REPX {psrad x, 12 }, m4, m1, m3, m6
+ mova [r3+16*4], m4
+ mova [r3+16*5], m1
+ mova m4, [r3+16*6] ; t30
+ mova m1, [r3+16*7] ; t31a
+ psubd m5, m4, m8 ; t25a
+ paddd m4, m8 ; t30a
+ psubd m8, m1, m0 ; t24
+ paddd m1, m0 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m1
+ REPX {pminsd x, m13}, m8, m5, m4, m1
+ REPX {pmulld x, m14}, m5, m8, m7, m2
+ paddd m5, m11
+ paddd m8, m11
+ psubd m0, m5, m7 ; t22
+ paddd m5, m7 ; t25
+ psubd m7, m8, m2 ; t23a
+ paddd m2, m8 ; t24a
+ REPX {psrad x, 12 }, m0, m7, m2, m5
+ mova [r3+16*6], m0
+ mova [r3+16*7], m7
+ mova [r3+16*8], m2
+ mova [r3+16*9], m5
+ mova [r3+16*10], m3
+ mova [r3+16*11], m6
+ mova [r3+16*12], m9
+ mova [r3+16*13], m10
+ mova [r3+16*14], m4
+ mova [r3+16*15], m1
+%else
+ mova [r3+ 8*16], m2
+ mova [r3+ 9*16], m3
+ mova [r3+10*16], m4
+ mova [r3+11*16], m5
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a
+ mova m2, [r3+ 8*16]
+ mova m4, [r3+10*16]
+ mova m5, [r3+11*16]
+ mova [r3+ 8*16], m0
+ mova [r3+10*16], m6
+ mova [r3+11*16], m7
+ mova m7, [r3+ 9*16]
+ mova [r3+ 9*16], m1
+ ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a
+ mova m0, [r3+ 8*16]
+ mova m1, [r3+ 9*16]
+ mova m6, [r3+10*16]
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6
+ REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6
+ psubd m3, m0, m4 ; t25
+ mova [r3+ 8*16], m3
+ mova m3, [r3+11*16]
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m3, m7 ; t22
+ paddd m7, m3 ; t23
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pmaxsd m3, [r3+ 8*16]
+ mova [r3+ 8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pminsd m3, [r3+ 8*16]
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m1
+ mova [r3+10*16], m6
+ mova [r3+11*16], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m3, m2 ; t26
+ paddd m3, m2 ; t25
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m6, [r3+10*16]
+ mova m7, [r3+11*16]
+ mova [r3+ 8*16], m3
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m2 ; t20a
+ paddd m7, m2 ; t23a
+ mova m2, [o(clip_18b_min)]
+ REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pmaxsd m2, [r3+ 8*16]
+ mova [r3+ 8*16], m2
+ mova m2, [o(clip_18b_max)]
+ REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pminsd m2, [r3+ 8*16]
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m2
+ mova [r3+14*16], m5
+ mova [r3+15*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20
+ mova [r3+10*16], m3
+ mova m0, [o(clip_18b_min)]
+ mova m2, [o(clip_18b_max)]
+ mova m5, [r3+16*2] ; t18a
+ mova m7, [r3+16*3] ; t19
+ psubd m3, m5, m1 ; t21
+ paddd m5, m1 ; t18
+ psubd m1, m7, m6 ; t20a
+ paddd m7, m6 ; t19a
+ REPX {pmaxsd x, m0}, m5, m7, m3, m1
+ REPX {pminsd x, m2}, m5, m7, m3, m1
+ mova [r3+16*2], m5
+ mova [r3+16*3], m7
+ mova [r3+11*16], m3
+ mova m3, [r3+10*16]
+ mova m5, [r3+16*4] ; t28
+ mova m7, [r3+16*5] ; t29a
+ psubd m6, m5, m3 ; t27a
+ paddd m5, m3 ; t28a
+ psubd m3, m7, m4 ; t26
+ paddd m7, m4 ; t29
+ REPX {pmaxsd x, m0}, m5, m7, m6, m3
+ REPX {pminsd x, m2}, m5, m7, m6, m3
+ mova [r3+16*12], m5
+ mova [r3+16*13], m7
+ mova m5, [o(pd_2048)]
+ mova m7, [o(pd_2896)]
+ mova m4, [r3+11*16]
+ REPX {pmulld x, m7}, m6, m3, m1, m4
+ paddd m6, m5
+ paddd m3, m5
+ psubd m5, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m4 ; t21a
+ paddd m3, m4 ; t26a
+ REPX {psrad x, 12}, m5, m1, m3, m6
+ mova [r3+16*4], m5
+ mova [r3+16*5], m1
+ mova [r3+16*10], m3
+ mova [r3+16*11], m6
+
+ mova m5, [r3+14*16]
+ mova m6, [r3+15*16]
+ mova m3, [r3+16*0] ; t16a
+ mova m4, [r3+16*1] ; t17
+ psubd m1, m3, m6 ; t23
+ paddd m3, m6 ; t16
+ psubd m6, m4, m5 ; t22a
+ paddd m4, m5 ; t17a
+ REPX {pmaxsd x, m0}, m3, m4, m1, m6
+ REPX {pminsd x, m2}, m3, m4, m1, m6
+ mova [r3+16*0], m3
+ mova [r3+16*1], m4
+ mova m5, [r3+ 8*16]
+ mova m3, [r3+ 9*16]
+ mova [r3+ 8*16], m1
+ mova [r3+ 9*16], m6
+ mova m4, [r3+16*6] ; t30
+ mova m1, [r3+16*7] ; t31a
+ psubd m6, m1, m5 ; t24
+ paddd m1, m5 ; t31
+ psubd m5, m4, m3 ; t25a
+ paddd m4, m3 ; t30a
+ REPX {pmaxsd x, m0}, m6, m5, m4, m1
+ REPX {pminsd x, m2}, m6, m5, m4, m1
+ mova [r3+16*14], m4
+ mova [r3+16*15], m1
+ mova m4, [o(pd_2048)]
+ mova m1, [r3+ 9*16]
+ mova m2, [r3+ 8*16]
+ REPX {pmulld x, m7}, m5, m6, m1, m2
+ paddd m5, m4
+ paddd m6, m4
+ psubd m0, m5, m1 ; t22
+ paddd m5, m1 ; t25
+ psubd m1, m6, m2 ; t23a
+ paddd m2, m6 ; t24a
+ REPX {psrad x, 12}, m0, m1, m2, m5
+ mova [r3+16*6], m0
+ mova [r3+16*7], m1
+ mova [r3+16*8], m2
+ mova [r3+16*9], m5
+%endif
+ ret
+
+ ; final sumsub for idct16 as well as idct32, plus final downshift
+%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx
+ mova m%4, [r3+16*(23-%1)]
+ pmaxsd m%1, m12
+ pminsd m%1, m13
+ psubd m%3, m%1, m%4 ; idct16 out15 - n
+ paddd m%1, m%4 ; idct16 out0 + n
+ pmaxsd m%1, m12
+ pmaxsd m%3, m12
+ pminsd m%1, m13
+ pminsd m%3, m13
+ paddd m%1, m11
+ paddd m%3, m11
+ mova m%5, [r3+16*( 0+%1)]
+ mova m%2, [r3+16*(15-%1)]
+ psubd m%4, m%1, m%2 ; out31 - n
+ paddd m%1, m%2 ; out0 + n
+ paddd m%2, m%3, m%5 ; out15 - n
+ psubd m%3, m%5 ; out16 + n
+ REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%endmacro
+
+.round_dct32:
+%if ARCH_X86_64
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31
+ mova [r3+ 0*16], m6
+ mova [r3+23*16], m7
+ IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30
+ packssdw m0, m1 ; 0 1
+ packssdw m14, m15 ; 14 15
+ packssdw m8, m6 ; 16 17
+ packssdw m7, m9 ; 30 31
+ mova [r3+16*15], m14
+ mova [r3+16*14], m7
+ IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29
+ IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28
+ packssdw m2, m3 ; 2 3
+ packssdw m14, m15 ; 12 13
+ packssdw m10, m1 ; 18 19
+ packssdw m9, m7 ; 28 29
+ mova [r3+16*13], m14
+ mova [r3+16*12], m9
+ IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27
+ IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26
+ packssdw m4, m5 ; 4 5
+ packssdw m14, m15 ; 10 11
+ packssdw m1, m3 ; 20 21
+ packssdw m9, m7 ; 26 27
+ mova [r3+16*11], m14
+ mova [r3+16*10], m9
+ mova m6, [r3+ 0*16]
+ mova m7, [r3+23*16]
+ IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25
+ IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24
+ packssdw m6, m7 ; 6 7
+ packssdw m11, m15 ; 8 9
+ packssdw m14, m3 ; 22 23
+ packssdw m9, m5 ; 24 25
+ mova [r3+16*9], m11
+ mova [r3+16*8], m9
+ mova m12, m1
+ ret
+%else
+ mova [r3+16*16], m0
+ mova [r3+17*16], m1
+ mova [r3+18*16], m2
+ mova [r3+19*16], m3
+ mova [r3+20*16], m4
+ mova [r3+21*16], m5
+ mova [r3+22*16], m6
+ mova [r3+23*16], m7
+ mova m1, [o(pd_2)]
+ mova m2, [o(clip_18b_min)]
+ mova m3, [o(clip_18b_max)]
+
+ mov r4, 15*16
+.loop_dct32_end:
+ mova m0, [r3+16*16]
+ mova m6, [r3+16*24]
+ pmaxsd m0, m2
+ pminsd m0, m3
+ psubd m5, m0, m6 ; idct16 out15 - n
+ paddd m0, m6 ; idct16 out0 + n
+ pmaxsd m0, m2
+ pmaxsd m5, m2
+ pminsd m0, m3
+ pminsd m5, m3
+ paddd m0, m1
+ paddd m5, m1
+ mova m7, [r3]
+ mova m4, [r3+r4]
+ psubd m6, m0, m4 ; out31 - n
+ paddd m0, m4 ; out0 + n
+ paddd m4, m5, m7 ; out15 - n
+ psubd m5, m7 ; out16 + n
+ REPX {psrad x, 2}, m0, m5, m4, m6
+ mova [r3], m0
+ mova [r3+r4], m4
+ mova [r3+16*16], m5
+ mova [r3+24*16], m6
+ add r3, 16
+ sub r4, 32
+ jg .loop_dct32_end
+ ret
+%endif
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+.dconly1:
+ add r5d, 640
+ sar r5d, 10
+.dconly2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
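+ ; add the broadcast dc offset to one 32-pixel row per iteration,
+ ; clamping each sample to [0, pixel_max]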
+.dconly_loop:
+ mova m1, [dstq+16*0]
+ mova m2, [dstq+16*1]
+ mova m3, [dstq+16*2]
+ mova m4, [dstq+16*3]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m2
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
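+ ; here the loop only scans tbl_32x16_2d for the last first-pass iteration
+ ; with nonzero coefficients; nothing is pre-cleared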
+ mov r5d, 8
+.zero_loop:
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x16_2d)+r5]
+ jl .zero_loop
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+64* 1+r5*8]
+ mova m1, [cq+64* 7+r5*8]
+ mova m2, [cq+64* 9+r5*8]
+ mova m3, [cq+64*15+r5*8]
+ mova m4, [cq+64*17+r5*8]
+ mova m5, [cq+64*23+r5*8]
+ mova m6, [cq+64*25+r5*8]
+ mova m7, [cq+64*31+r5*8]
+ mov r3, rsp
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+ mova m0, [cq+64* 3+r5*8]
+ mova m1, [cq+64* 5+r5*8]
+ mova m2, [cq+64*11+r5*8]
+ mova m3, [cq+64*13+r5*8]
+ mova m4, [cq+64*19+r5*8]
+ mova m5, [cq+64*21+r5*8]
+ mova m6, [cq+64*27+r5*8]
+ mova m7, [cq+64*29+r5*8]
+%if ARCH_X86_32
+ add r3, 16*8
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 16*8
+%endif
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ add r3, 16*(16+4*ARCH_X86_32)
+
+ mova m0, [cq+64* 2+r5*8]
+ mova m1, [cq+64* 6+r5*8]
+ mova m2, [cq+64*10+r5*8]
+ mova m3, [cq+64*14+r5*8]
+ mova m4, [cq+64*18+r5*8]
+ mova m5, [cq+64*22+r5*8]
+ mova m6, [cq+64*26+r5*8]
+ mova m7, [cq+64*30+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+64* 0+r5*8]
+ mova m1, [cq+64* 4+r5*8]
+ mova m2, [cq+64* 8+r5*8]
+ mova m3, [cq+64*12+r5*8]
+ mova m4, [cq+64*16+r5*8]
+ mova m5, [cq+64*20+r5*8]
+ mova m6, [cq+64*24+r5*8]
+ mova m7, [cq+64*28+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call .round_dct32
+
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64* 8+r5*8], m8
+ mova [cq+64* 9+r5*8], m9
+ mova [cq+64*10+r5*8], m10
+ mova [cq+64*11+r5*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64* 4+r5*8], m8
+ mova [cq+64* 5+r5*8], m9
+ mova [cq+64* 6+r5*8], m10
+ mova [cq+64* 7+r5*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64*12+r5*8], m8
+ mova [cq+64*13+r5*8], m9
+ mova [cq+64*14+r5*8], m10
+ mova [cq+64*15+r5*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64* 4+r5*8], m0
+ mova [cq+64* 5+r5*8], m1
+ mova [cq+64* 6+r5*8], m2
+ mova [cq+64* 7+r5*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64* 8+r5*8], m0
+ mova [cq+64* 9+r5*8], m1
+ mova [cq+64*10+r5*8], m2
+ mova [cq+64*11+r5*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64*12+r5*8], m0
+ mova [cq+64*13+r5*8], m1
+ mova [cq+64*14+r5*8], m2
+ mova [cq+64*15+r5*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [cq+64* 0+r5*8], m0
+ mova [cq+64* 1+r5*8], m1
+ mova [cq+64* 2+r5*8], m2
+ mova [cq+64* 3+r5*8], m3
+ pxor m0, m0
+ REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2: this has to be a call, otherwise the stack pointer would have
+ ; the wrong offset in the 8-bit code
+ call .pass2
+ RET
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 4
+ jmp m(idct_16x16_internal_16bpc).loop_pass2
+
+.round_dct32:
+%if ARCH_X86_64
+ psrld m11, 11 ; pd_1
+ IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31
+ mova [r3+ 0*16], m6
+ mova [r3+23*16], m7
+ IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30
+ packssdw m0, m1 ; 0 1
+ packssdw m14, m15 ; 14 15
+ packssdw m8, m6 ; 16 17
+ packssdw m7, m9 ; 30 31
+ mova [r3+16*15], m14
+ mova [r3+16*14], m7
+ IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29
+ IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28
+ packssdw m2, m3 ; 2 3
+ packssdw m14, m15 ; 12 13
+ packssdw m10, m1 ; 18 19
+ packssdw m9, m7 ; 28 29
+ mova [r3+16*13], m14
+ mova [r3+16*12], m9
+ IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27
+ IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26
+ packssdw m4, m5 ; 4 5
+ packssdw m14, m15 ; 10 11
+ packssdw m1, m3 ; 20 21
+ packssdw m9, m7 ; 26 27
+ mova [r3+16*11], m14
+ mova [r3+16*10], m9
+ mova m6, [r3+ 0*16]
+ mova m7, [r3+23*16]
+ IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25
+ IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24
+ packssdw m6, m7 ; 6 7
+ packssdw m11, m15 ; 8 9
+ packssdw m14, m3 ; 22 23
+ packssdw m9, m5 ; 24 25
+ mova [r3+16*9], m11
+ mova [r3+16*8], m9
+ mova m12, m1
+ ret
+%else
+ mova [r3+16*16], m0
+ mova [r3+17*16], m1
+ mova [r3+18*16], m2
+ mova [r3+19*16], m3
+ mova [r3+20*16], m4
+ mova [r3+21*16], m5
+ mova [r3+22*16], m6
+ mova [r3+23*16], m7
+ pcmpeqd m1, m1 ; -1
+ mova m2, [o(clip_18b_min)]
+ mova m3, [o(clip_18b_max)]
+
+ mov r4, 15*16
+.loop_dct32_end:
+ mova m0, [r3+16*16]
+ mova m6, [r3+16*24]
+ psubd m5, m0, m6 ; idct16 out15 - n
+ paddd m0, m6 ; idct16 out0 + n
+ pmaxsd m0, m2
+ pmaxsd m5, m2
+ pminsd m0, m3
+ pminsd m5, m3
+ psubd m0, m1
+ psubd m5, m1
+ mova m7, [r3]
+ mova m4, [r3+r4]
+ psubd m6, m0, m4 ; out31 - n
+ paddd m0, m4 ; out0 + n
+ paddd m4, m5, m7 ; out15 - n
+ psubd m5, m7 ; out16 + n
+ REPX {psrad x, 1}, m0, m5, m4, m6
+ mova [r3], m0
+ mova [r3+r4], m4
+ mova [r3+16*16], m5
+ mova [r3+24*16], m6
+ add r3, 16
+ sub r4, 32
+ jg .loop_dct32_end
+ ret
+%endif
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%if ARCH_X86_32
+ mov [rsp+5*32*16+1*gprsize], dstq
+%elif WIN64
+ mov [rsp+5*32*16+1*gprsize], r7
+%endif
+%undef cmp
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+32*16+r5*8+0*32*16], m0
+ mova [rsp+40*16+r5*8+0*32*16], m0
+ mova [rsp+32*16+t0*8+0*32*16], m0
+ mova [rsp+32*16+t1*8+0*32*16], m0
+ mova [rsp+32*16+r5*8+1*32*16], m0
+ mova [rsp+40*16+r5*8+1*32*16], m0
+ mova [rsp+32*16+t0*8+1*32*16], m0
+ mova [rsp+32*16+t1*8+1*32*16], m0
+ mova [rsp+32*16+r5*8+2*32*16], m0
+ mova [rsp+40*16+r5*8+2*32*16], m0
+ mova [rsp+32*16+t0*8+2*32*16], m0
+ mova [rsp+32*16+t1*8+2*32*16], m0
+ mova [rsp+32*16+r5*8+3*32*16], m0
+ mova [rsp+40*16+r5*8+3*32*16], m0
+ mova [rsp+32*16+t0*8+3*32*16], m0
+ mova [rsp+32*16+t1*8+3*32*16], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+5*32*16], eobd
+.loop_pass1:
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128* 7+r5*8]
+ mova m2, [cq+128* 9+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ mova m4, [cq+128*17+r5*8]
+ mova m5, [cq+128*23+r5*8]
+ mova m6, [cq+128*25+r5*8]
+ mova m7, [cq+128*31+r5*8]
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mov r3, rsp
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128* 5+r5*8]
+ mova m2, [cq+128*11+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ mova m4, [cq+128*19+r5*8]
+ mova m5, [cq+128*21+r5*8]
+ mova m6, [cq+128*27+r5*8]
+ mova m7, [cq+128*29+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128* 6+r5*8]
+ mova m2, [cq+128*10+r5*8]
+ mova m3, [cq+128*14+r5*8]
+ mova m4, [cq+128*18+r5*8]
+ mova m5, [cq+128*22+r5*8]
+ mova m6, [cq+128*26+r5*8]
+ mova m7, [cq+128*30+r5*8]
+ add r3, 16*(16+4*ARCH_X86_32)
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 4+r5*8]
+ mova m2, [cq+128* 8+r5*8]
+ mova m3, [cq+128*12+r5*8]
+ mova m4, [cq+128*16+r5*8]
+ mova m5, [cq+128*20+r5*8]
+ mova m6, [cq+128*24+r5*8]
+ mova m7, [cq+128*28+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+2*32*16], m8
+ mova [rsp+40*16+r5*8+2*32*16], m10
+ mova [rsp+32*16+t1*8+2*32*16], m9
+ mova [rsp+32*16+t0*8+2*32*16], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+1*32*16], m8
+ mova [rsp+40*16+r5*8+1*32*16], m10
+ mova [rsp+32*16+t1*8+1*32*16], m9
+ mova [rsp+32*16+t0*8+1*32*16], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+3*32*16], m8
+ mova [rsp+40*16+r5*8+3*32*16], m10
+ mova [rsp+32*16+t1*8+3*32*16], m9
+ mova [rsp+32*16+t0*8+3*32*16], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+1*32*16], m0
+ mova [rsp+40*16+r5*8+1*32*16], m2
+ mova [rsp+32*16+t1*8+1*32*16], m1
+ mova [rsp+32*16+t0*8+1*32*16], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+2*32*16], m0
+ mova [rsp+40*16+r5*8+2*32*16], m2
+ mova [rsp+32*16+t1*8+2*32*16], m1
+ mova [rsp+32*16+t0*8+2*32*16], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+3*32*16], m0
+ mova [rsp+40*16+r5*8+3*32*16], m2
+ mova [rsp+32*16+t1*8+3*32*16], m1
+ mova [rsp+32*16+t0*8+3*32*16], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ pxor m7, m7
+ ; clear lower half of [cq]
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ mova [rsp+32*16+r5*8+0*32*16], m0
+ mova [rsp+40*16+r5*8+0*32*16], m2
+ mova [rsp+32*16+t1*8+0*32*16], m1
+ mova [rsp+32*16+t0*8+0*32*16], m3
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
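+ ; the second pass reuses the 16x32 loop (.loop_pass2_entry) over four
+ ; 8-pixel-wide strips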
+ mov eobd, [rsp+gprsize*0+5*32*16]
+ add rsp, 29*16
+ cmp eobd, 36
+ jl .load_veryfast
+ cmp eobd, 136
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+64]
+ mov r7, -8
+%else
+ lea r2, [rsp+(4*32+3)*16]
+ mov dword [r2+0*gprsize], 4
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1
+
+cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
+ 0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0
+ mov [rsp+gprsize*1+(64*2+12)*16], r0
+ mov [rsp+gprsize*2+(64*2+12)*16], r1
+ mov [rsp+gprsize*3+(64*2+12)*16], r2
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7
+ mov [rsp+gprsize*1+(64*2+12)*16], r9
+%if WIN64
+ mov [rsp+gprsize*2+(64*2+12)*16], r7
+ mov [rsp+gprsize*3+(64*2+12)*16], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m0
+ mova [rsp+12*16+t2*8], m0
+ mova [rsp+12*16+t3*8], m0
+ mova [rsp+76*16+t0*8], m0
+ mova [rsp+76*16+t1*8], m0
+ mova [rsp+76*16+t2*8], m0
+ mova [rsp+76*16+t3*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+(64*2+12)*16], eobd
+ mov r3, rsp
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 6, 0
+ mov r2, [rsp+gprsize*3+(64*2+12)*16]
+ mov [rsp+gprsize*3+(64*2+12)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*128+r5*8]
+ mova m1, [cq+ 3*128+r5*8]
+ mova m2, [cq+ 5*128+r5*8]
+ mova m3, [cq+ 7*128+r5*8]
+ mova m4, [cq+ 9*128+r5*8]
+ mova m5, [cq+11*128+r5*8]
+ mova m6, [cq+13*128+r5*8]
+ mova m7, [cq+15*128+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*128+r5*8]
+ mova m1, [cq+ 2*128+r5*8]
+ mova m2, [cq+ 4*128+r5*8]
+ mova m3, [cq+ 6*128+r5*8]
+ mova m4, [cq+ 8*128+r5*8]
+ mova m5, [cq+10*128+r5*8]
+ mova m6, [cq+12*128+r5*8]
+ mova m7, [cq+14*128+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x16_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+76*16+t0*8], m8
+ mova [rsp+76*16+t1*8], m9
+ mova [rsp+76*16+t2*8], m10
+ mova [rsp+76*16+t3*8], m11
+%else
+ mova [rsp+76*16+t0*8], m0
+ mova [rsp+76*16+t1*8], m1
+ mova [rsp+76*16+t2*8], m2
+ mova [rsp+76*16+t3*8], m3
+ mova m0, [rsp+ 8*16]
+ mova m2, [rsp+ 9*16]
+ mova m4, [rsp+10*16]
+ mova m6, [rsp+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t2*8], m2
+ mova [rsp+12*16+t3*8], m3
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*3+(64*2+12)*16]
+%endif
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
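+ ; pass 2 runs the 64-point second pass with the 8-bit ssse3 kernels
+ ; selected into t0/t1 below, over two 8-pixel-wide strips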
+ mov eobd, [rsp+gprsize*0+(64*2+12)*16]
+ cmp eobd, 151
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+ add rsp, 9*16
+
+%if ARCH_X86_64
+ lea r2, [dstq+32]
+ mov r7, -4
+%else
+ lea r2, [rsp+(64*2+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 2
+%endif
+.loop_pass2:
+%if ARCH_X86_32
+ mov dstq, [r2+1*gprsize]
+%endif
+ call .pass2
+ add rsp, 64*16
+%if ARCH_X86_64
+ add r7, 2
+ lea dstq, [r2+r7*8]
+ jl .loop_pass2
+%else
+ add dword [r2+1*gprsize], 16
+ dec dword [r2+0*gprsize]
+ jg .loop_pass2
+%endif
+%assign stack_size (stack_size-(64*2+9)*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-(64*2+9)*16)
+%assign stack_offset (stack_offset-(64*2+9)*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+%if ARCH_X86_64
+ mov r9, [rsp+gprsize*1+3*16]
+%if WIN64
+ mov r7, [rsp+gprsize*2+3*16]
+ mov r8, [rsp+gprsize*3+3*16]
+%endif
+%endif
+ RET
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [rsp+gprsize+16* 3]
+ mova m1, [rsp+gprsize+16* 4]
+ mova m2, [rsp+gprsize+16* 5]
+ mova m3, [rsp+gprsize+16* 6]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 4*16], m1
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 6*16], m3
+ mova [rsp+gprsize+ 7*16], m4
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m0, [rsp+gprsize+16*11]
+ mova m1, [rsp+gprsize+16*12]
+ mova m2, [rsp+gprsize+16*13]
+ mova m3, [rsp+gprsize+16*14]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+ mova m7, [rsp+gprsize+ 0*16]
+ mova [rsp+gprsize+11*16], m0
+ mova [rsp+gprsize+12*16], m1
+ mova [rsp+gprsize+13*16], m2
+ mova [rsp+gprsize+14*16], m3
+ mova [rsp+gprsize+15*16], m4
+ mova [rsp+gprsize+16*16], m5
+ mova [rsp+gprsize+17*16], m6
+ mova [rsp+gprsize+18*16], m7
+%if ARCH_X86_64
+ call r8
+%else
+ call [r2+4*gprsize]
+%endif
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+10*16], m7
+%if ARCH_X86_64
+ call r9
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%else
+ call [r2+5*gprsize]
+%endif
+ lea r3, [strideq*3]
+ lea r4, [rsp+gprsize+ 3*16]
+%if ARCH_X86_64
+ mov r6d, 8
+%else
+ mov dword [r2+2*gprsize], 8
+%endif
+.loop_write:
+ mova m0, [r4+0*16]
+ mova m1, [r4+1*16]
+ mova m2, [r4+2*16]
+ mova m3, [r4+3*16]
+ mova m4, [r4+4*16]
+ mova m5, [r4+5*16]
+ mova m6, [r4+6*16]
+ mova m7, [r4+7*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ add r4, 8*16
+%if ARCH_X86_64
+ dec r6d
+%else
+ dec dword [r2+2*gprsize]
+%endif
+ jg .loop_write
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
+ 0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0
+ mov [rsp+gprsize*1+(64*4+32)*16], r0
+ mov [rsp+gprsize*2+(64*4+32)*16], r1
+ mov [rsp+gprsize*3+(64*4+32)*16], r2
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7
+ mov [rsp+gprsize*1+(64*4+32)*16], r9
+%if WIN64
+ mov [rsp+gprsize*2+(64*4+32)*16], r7
+ mov [rsp+gprsize*3+(64*4+32)*16], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ mova [rsp+ 32*16+t0*8], m0
+ mova [rsp+ 32*16+t1*8], m0
+ mova [rsp+ 32*16+t2*8], m0
+ mova [rsp+ 32*16+t3*8], m0
+ mova [rsp+ 96*16+t0*8], m0
+ mova [rsp+ 96*16+t1*8], m0
+ mova [rsp+ 96*16+t2*8], m0
+ mova [rsp+ 96*16+t3*8], m0
+ mova [rsp+160*16+t0*8], m0
+ mova [rsp+160*16+t1*8], m0
+ mova [rsp+160*16+t2*8], m0
+ mova [rsp+160*16+t3*8], m0
+ mova [rsp+224*16+t0*8], m0
+ mova [rsp+224*16+t1*8], m0
+ mova [rsp+224*16+t2*8], m0
+ mova [rsp+224*16+t3*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+(64*4+32)*16], eobd
+ mov r3, rsp
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 6, 0
+ mov r2, [rsp+gprsize*3+(64*4+32)*16]
+ mov [rsp+gprsize*3+(64*4+32)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128* 7+r5*8]
+ mova m2, [cq+128* 9+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ mova m4, [cq+128*17+r5*8]
+ mova m5, [cq+128*23+r5*8]
+ mova m6, [cq+128*25+r5*8]
+ mova m7, [cq+128*31+r5*8]
+ mov r3, rsp
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128* 5+r5*8]
+ mova m2, [cq+128*11+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ mova m4, [cq+128*19+r5*8]
+ mova m5, [cq+128*21+r5*8]
+ mova m6, [cq+128*27+r5*8]
+ mova m7, [cq+128*29+r5*8]
+%if ARCH_X86_32
+ add r3, 16*8
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 16*8
+%endif
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ add r3, 16*(16+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128* 6+r5*8]
+ mova m2, [cq+128*10+r5*8]
+ mova m3, [cq+128*14+r5*8]
+ mova m4, [cq+128*18+r5*8]
+ mova m5, [cq+128*22+r5*8]
+ mova m6, [cq+128*26+r5*8]
+ mova m7, [cq+128*30+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 4+r5*8]
+ mova m2, [cq+128* 8+r5*8]
+ mova m3, [cq+128*12+r5*8]
+ mova m4, [cq+128*16+r5*8]
+ mova m5, [cq+128*20+r5*8]
+ mova m6, [cq+128*24+r5*8]
+ mova m7, [cq+128*28+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32
+
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+160*16+t0*8], m8
+ mova [rsp+160*16+t1*8], m9
+ mova [rsp+160*16+t2*8], m10
+ mova [rsp+160*16+t3*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+ 96*16+t0*8], m8
+ mova [rsp+ 96*16+t1*8], m9
+ mova [rsp+ 96*16+t2*8], m10
+ mova [rsp+ 96*16+t3*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+224*16+t0*8], m8
+ mova [rsp+224*16+t1*8], m9
+ mova [rsp+224*16+t2*8], m10
+ mova [rsp+224*16+t3*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+ 96*16+t0*8], m0
+ mova [rsp+ 96*16+t1*8], m1
+ mova [rsp+ 96*16+t2*8], m2
+ mova [rsp+ 96*16+t3*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+160*16+t0*8], m0
+ mova [rsp+160*16+t1*8], m1
+ mova [rsp+160*16+t2*8], m2
+ mova [rsp+160*16+t3*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+224*16+t0*8], m0
+ mova [rsp+224*16+t1*8], m1
+ mova [rsp+224*16+t2*8], m2
+ mova [rsp+224*16+t3*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [rsp+ 32*16+t0*8], m0
+ mova [rsp+ 32*16+t1*8], m1
+ mova [rsp+ 32*16+t2*8], m2
+ mova [rsp+ 32*16+t3*8], m3
+ pxor m0, m0
+ REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*3+(64*4+32)*16]
+%endif
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
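+ ; the second pass reuses inv_txfm_add_dct_dct_16x64_16bpc.loop_pass2 over
+ ; four 8-pixel-wide strips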
+ mov eobd, [rsp+gprsize*0+(64*4+32)*16]
+ cmp eobd, 136
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+ add rsp, 29*16
+
+%if ARCH_X86_64
+ lea r2, [dstq+64]
+ mov r7, -8
+%else
+ lea r2, [rsp+(64*4+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 4
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+ mov r5d, 8
+.zero_loop:
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x16_2d)+r5]
+ jl .zero_loop
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
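+ ; r4 walks the idct64_mul_16bpc constant table; each .main_part1 call
+ ; consumes 12 dword constants and advances it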
+ mova m0, [cq+64* 1+r5*8]
+ mova m1, [cq+64*31+r5*8]
+ mova m2, [cq+64*17+r5*8]
+ mova m3, [cq+64*15+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 7+r5*8]
+ mova m1, [cq+64*25+r5*8]
+ mova m2, [cq+64*23+r5*8]
+ mova m3, [cq+64* 9+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 5+r5*8]
+ mova m1, [cq+64*27+r5*8]
+ mova m2, [cq+64*21+r5*8]
+ mova m3, [cq+64*11+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 3+r5*8]
+ mova m1, [cq+64*29+r5*8]
+ mova m2, [cq+64*19+r5*8]
+ mova m3, [cq+64*13+r5*8]
+ call .main_part1
+ call .main_part2
+
+ mova m0, [cq+64* 2+r5*8]
+ mova m1, [cq+64*14+r5*8]
+ mova m2, [cq+64*18+r5*8]
+ mova m3, [cq+64*30+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+64* 6+r5*8]
+ mova m1, [cq+64*10+r5*8]
+ mova m2, [cq+64*22+r5*8]
+ mova m3, [cq+64*26+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+64* 4+r5*8]
+ mova m1, [cq+64*12+r5*8]
+ mova m2, [cq+64*20+r5*8]
+ mova m3, [cq+64*28+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+64* 0+r5*8]
+ mova m1, [cq+64* 8+r5*8]
+ mova m2, [cq+64*16+r5*8]
+ mova m3, [cq+64*24+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 10 ; pd_2
+%else
+ mova m7, [o(pd_2)]
+%endif
+ call .main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ lea r4, [cq+r5*8+64*28]
+ call .shift_transpose
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2: this has to be a call, otherwise the stack pointer would have
+ ; the wrong offset in the 8-bit code
+ call .pass2
+ RET
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 8
+ jmp m(idct_16x16_internal_16bpc).loop_pass2
+
+.main_part1: ; idct64 steps 1-5
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+%if ARCH_X86_64
+ movd m7, [r4+4*0]
+ movd m8, [r4+4*1]
+ movd m6, [r4+4*2]
+ movd m9, [r4+4*3]
+ movd m5, [r4+4*4]
+ movd m10, [r4+4*5]
+ movd m4, [r4+4*6]
+ movd m15, [r4+4*7]
+ REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15
+ pmulld m7, m0 ; t63a
+ pmulld m0, m8 ; t32a
+ pmulld m6, m1 ; t62a
+ pmulld m1, m9 ; t33a
+ pmulld m5, m2 ; t61a
+ pmulld m2, m10 ; t34a
+ pmulld m4, m3 ; t60a
+ pmulld m3, m15 ; t35a
+ movd m10, [r4+4*8]
+ movd m15, [r4+4*9]
+ REPX {pshufd x, x, q0000}, m10, m15
+ REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
+ REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+ psubd m8, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m5 ; t61
+ paddd m4, m5 ; t60
+ REPX {pmaxsd x, m12}, m8, m1, m6, m2
+ REPX {pminsd x, m13}, m8, m1, m6, m2
+ ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
+ REPX {pmaxsd x, m12}, m0, m3, m7, m4
+ REPX {pminsd x, m13}, m0, m3, m7, m4
+ movd m10, [r4+4*10]
+ movd m15, [r4+4*11]
+ REPX {pshufd x, x, q0000}, m10, m15
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m8, m2 ; t61
+ paddd m8, m2 ; t62
+ REPX {pmaxsd x, m12}, m5, m3, m4, m6
+ REPX {pminsd x, m13}, m5, m3, m4, m6
+ ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
+ ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
+ REPX {pmaxsd x, m12}, m0, m7, m1, m8
+ REPX {pminsd x, m13}, m0, m7, m1, m8
+ add r4, 4*12
+ mova [r3+16*0], m0
+ mova [r3+16*7], m7
+ mova [r3+16*1], m1
+ mova [r3+16*6], m8
+ mova [r3+16*2], m6
+ mova [r3+16*5], m4
+ mova [r3+16*3], m3
+ mova [r3+16*4], m5
+%else
+ movd m7, [r4+4*0]
+ movd m6, [r4+4*2]
+ movd m5, [r4+4*4]
+ movd m4, [r4+4*6]
+ REPX {pshufd x, x, q0000}, m7, m6, m5, m4
+ pmulld m7, m0 ; t63a
+ pmulld m6, m1 ; t62a
+ pmulld m5, m2 ; t61a
+ pmulld m4, m3 ; t60a
+ mova [r3+0*16], m6
+ mova [r3+1*16], m7
+ movd m6, [r4+4*1]
+ movd m7, [r4+4*3]
+ REPX {pshufd x, x, q0000}, m7, m6
+ pmulld m0, m6 ; t32a
+ pmulld m1, m7 ; t33a
+ movd m6, [r4+4*5]
+ movd m7, [r4+4*7]
+ REPX {pshufd x, x, q0000}, m7, m6
+ pmulld m2, m6 ; t34a
+ pmulld m3, m7 ; t35a
+ mova m6, [r3+0*16]
+ mova m7, [o(pd_2048)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3+1*16]
+ REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4
+ mova [r3+0*16], m5
+ psubd m5, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ mova [r3+1*16], m0
+ mova m0, [r3+0*16]
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m0 ; t61
+ paddd m4, m0 ; t60
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+ pmaxsd m0, [r3+1*16]
+ mova [r3+0*16], m0
+ mova m0, [o(clip_18b_max)]
+ REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+ pminsd m0, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ movd m3, [r4+4*8]
+ movd m4, [r4+4*9]
+ REPX {pshufd x, x, q0000}, m3, m4
+ mova [r3+4*16], m2
+ ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a
+ mova m2, [r3+4*16]
+ mova [r3+4*16], m5
+ ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a
+ mova m0, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova m4, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ mova [r3+0*16], m5
+ mova m5, [r3+4*16]
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m5, m2 ; t61
+ paddd m2, m5 ; t62
+ mova m5, [o(clip_18b_min)]
+ REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+ pmaxsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(clip_18b_max)]
+ REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+ pminsd m5, [r3+0*16]
+ mova [r3+16*0], m0
+ mova [r3+16*7], m7
+ mova [r3+16*1], m1
+ mova [r3+16*6], m2
+ mova [r3+16*2], m4
+ mova m7, [o(pd_2048)]
+ movd m0, [r4+4*10]
+ movd m1, [r4+4*11]
+ REPX {pshufd x, x, q0000}, m0, m1
+ ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60
+ mova [r3+16*3], m3
+ mova [r3+16*4], m5
+ mova m4, [r3+2*16]
+ ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a
+ add r4, 4*12
+ mova [r3+16*2], m6
+ mova [r3+16*5], m4
+%endif
+ add r3, 16*8
+ ret
+
+.main_part2: ; idct64 steps 6-9
+ lea r4, [r3+16*7]
+%if ARCH_X86_64
+ mova m10, [o(pd_1567)]
+ mova m15, [o(pd_3784)]
+.main_part2_loop:
+ mova m0, [r3-16*32] ; t32a
+ mova m1, [r4-16*24] ; t39a
+ mova m2, [r4-16*32] ; t63a
+ mova m3, [r3-16*24] ; t56a
+ mova m4, [r3-16*16] ; t40a
+ mova m5, [r4-16* 8] ; t47a
+ mova m6, [r4-16*16] ; t55a
+ mova m7, [r3-16* 8] ; t48a
+ psubd m8, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ REPX {pmaxsd x, m12}, m8, m1, m3, m4
+ REPX {pminsd x, m13}, m8, m1, m3, m4
+ ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
+ REPX {pmaxsd x, m12}, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m0, m5, m2, m7
+ psubd m6, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m8, m4 ; t55
+ paddd m8, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ REPX {pmaxsd x, m12}, m6, m7, m5, m4
+ REPX {pminsd x, m13}, m6, m7, m5, m4
+ REPX {pmulld x, m14}, m6, m7, m5, m4
+ REPX {pmaxsd x, m12}, m2, m0, m8, m1
+ REPX {pminsd x, m13}, m2, m0, m8, m1
+ paddd m6, m11
+ paddd m5, m11
+ psubd m3, m6, m7 ; t47
+ paddd m6, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m3, m6, m7, m5
+ mova [r4-16* 8], m2
+ mova [r3-16*32], m0
+ mova [r3-16* 8], m8
+ mova [r4-16*32], m1
+ mova [r4-16*24], m3
+ mova [r3-16*16], m6
+ mova [r3-16*24], m7
+ mova [r4-16*16], m5
+%else
+.main_part2_loop:
+ mova m0, [r3-16*32] ; t32a
+ mova m1, [r4-16*24] ; t39a
+ mova m2, [r4-16*32] ; t63a
+ mova m3, [r3-16*24] ; t56a
+ mova m4, [r3-16*16] ; t40a
+ mova m5, [r4-16* 8] ; t47a
+ mova m6, [r4-16*16] ; t55a
+ psubd m7, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ mova [r3+0*16], m7
+ mova m7, [r3-16* 8] ; t48a
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ mova m6, [o(clip_18b_min)]
+ REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+ pmaxsd m6, [r3+0*16]
+ mova [r3+0*16], m6
+ mova m6, [o(clip_18b_max)]
+ REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+ pminsd m6, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a
+ mova m2, [r3+1*16]
+ mova m7, [r3+3*16]
+ psubd m5, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ mova [r3+1*16], m5
+ mova m0, [r3+0*16]
+ mova m5, [r3+2*16]
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m6, m4 ; t55
+ paddd m6, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+ pmaxsd m3, [r3+1*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+ pminsd m3, [r3+0*16]
+ mova [r4-16* 8], m2
+ mova [r3-16*32], m0
+ mova [r3-16* 8], m6
+ mova [r4-16*32], m1
+ mova m0, [o(pd_2896)]
+ mova m1, [o(pd_2048)]
+ REPX {pmulld x, m0}, m3, m7, m5, m4
+ REPX {paddd x, m1}, m3, m5
+ psubd m6, m3, m7 ; t47
+ paddd m3, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m6, m3, m7, m5
+ mova [r4-16*24], m6
+ mova [r3-16*16], m3
+ mova [r3-16*24], m7
+ mova [r4-16*16], m5
+%endif
+ add r3, 16
+ sub r4, 16
+ cmp r3, r4
+ jl .main_part2_loop
+ sub r3, 4*16
+ ret
+
+.main_end_loop:
+ mova m0, [r3+16*28] ; idct8 0 + n
+.main_end_loop_start:
+ mova m2, [r3+16*12] ; idct32 16 + n
+ mova m3, [r4+16*12] ; idct32 31 - n
+%if ARCH_X86_64
+ mova m1, [r4+16*28] ; idct16 15 - n
+ mova m4, [r4-16* 4] ; idct64 63 - n
+ mova m5, [r3-16* 4] ; idct64 48 + n
+ mova m6, [r4-16*20] ; idct64 47 - n
+ mova m7, [r3-16*20] ; idct64 32 + n
+ pmaxsd m0, m12
+ pminsd m0, m13
+ paddd m8, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m12}, m8, m0
+ REPX {pminsd x, m13}, m8, m0
+ paddd m1, m8, m3 ; idct32 out0 + n
+ psubd m8, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m12}, m1, m8, m3, m0
+ REPX {pminsd x, m13}, m1, m3, m8, m0
+ REPX {paddd x, m15}, m1, m3, m0, m8
+ paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
+ psubd m1, m4 ; idct64 out63 - n (unshifted)
+ paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
+ psubd m3, m5 ; idct64 out48 + n (unshifted)
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
+ psubd m8, m7 ; idct64 out32 + n (unshifted)
+ mova [r3-16*20], m2
+ mova [r4+16*28], m1
+ mova [r4-16*20], m4
+ mova [r3+16*28], m3
+ mova [r3-16* 4], m5
+ mova [r4+16*12], m0
+ mova [r4-16* 4], m6
+ mova [r3+16*12], m8
+%else
+ mova m5, [o(clip_18b_min)]
+ mova m6, [o(clip_18b_max)]
+ mova m1, [r3+16*44] ; idct16 15 - n
+ pmaxsd m0, m5
+ pminsd m0, m6
+ paddd m4, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m5}, m4, m0
+ REPX {pminsd x, m6}, m4, m0
+ paddd m1, m4, m3 ; idct32 out0 + n
+ psubd m4, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m5}, m1, m4, m3, m0
+ REPX {pminsd x, m6}, m1, m3, m4, m0
+ REPX {paddd x, m7}, m1, m3, m0, m4
+ mova m5, [r4-16* 4] ; idct64 63 - n
+ mova m6, [r3-16* 4] ; idct64 48 + n
+ paddd m2, m1, m5 ; idct64 out0 + n (unshifted)
+ psubd m1, m5 ; idct64 out63 - n (unshifted)
+ paddd m5, m3, m6 ; idct64 out15 - n (unshifted)
+ psubd m3, m6 ; idct64 out48 + n (unshifted)
+ mova [r4+16*28], m1
+ mova [r3+16*28], m3
+ mova m6, [r4-16*20] ; idct64 47 - n
+ mova m1, [r3-16*20] ; idct64 32 + n
+ mova [r3-16*20], m2
+ mova [r4-16*20], m5
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m4, m1 ; idct64 out31 - n (unshifted)
+ psubd m4, m1 ; idct64 out32 + n (unshifted)
+ mova [r3-16* 4], m5
+ mova [r4+16*12], m0
+ mova [r4-16* 4], m6
+ mova [r3+16*12], m4
+%endif
+ sub r4, 16
+ add r3, 16
+ cmp r3, r4
+ jl .main_end_loop
+ ret
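+
+; Recombination summary (editorial restatement of the comments above): for
+; each n, idct16 out 0+n = idct8[0+n] + idct16[15-n], idct32 out 0+n =
+; idct16 out 0+n + idct32[31-n], and idct64 out 0+n = idct32 out 0+n +
+; idct64[63-n]; the mirrored outputs (15-n, 16+n, 31-n, 32+n, 47-n, 48+n,
+; 63-n) come from the corresponding subtractions, with the rounding constant
+; in m15 (or m7 on x86-32) added before the final psrad in .shift_transpose.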
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [r4+0*64], m0
+ mova [r4+1*64], m1
+ mova [r4+2*64], m2
+ mova [r4+3*64], m3
+ sub r4, 4*64
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+.dconly1:
+ add r5d, 640
+ sar r5d, 10
+.dconly2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+.dconly_loop:
+ paddw m1, m0, [dstq+16*0]
+ paddw m2, m0, [dstq+16*1]
+ paddw m3, m0, [dstq+16*2]
+ paddw m4, m0, [dstq+16*3]
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m2
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, 64
+ btc r3d, 16
+ jnc .dconly_loop
+ lea dstq, [dstq+strideq-128]
+ dec r3d
+ jg .dconly_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
+ 0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0, 4, 1
+ mov [rsp+(8*32+64+8)*16+1*gprsize], dstq
+ mov [rsp+(8*32+64+8)*16+2*gprsize], strideq
+%else
+ DECLARE_REG_TMP 4, 7, 8
+%if WIN64
+ mov [rsp+(8*32+64+1)*16+1*gprsize], r7
+ mov [rsp+64*16+0*gprsize], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ lea t2, [rsp+7*32*16]
+.zero_loop_inner:
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+ mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0
+ sub t2, 32*16
+ cmp t2, rsp
+ jge .zero_loop_inner
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128*31+r5*8]
+ mova m2, [cq+128*17+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 7+r5*8]
+ mova m1, [cq+128*25+r5*8]
+ mova m2, [cq+128*23+r5*8]
+ mova m3, [cq+128* 9+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 5+r5*8]
+ mova m1, [cq+128*27+r5*8]
+ mova m2, [cq+128*21+r5*8]
+ mova m3, [cq+128*11+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128*29+r5*8]
+ mova m2, [cq+128*19+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128*14+r5*8]
+ mova m2, [cq+128*18+r5*8]
+ mova m3, [cq+128*30+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+128* 6+r5*8]
+ mova m1, [cq+128*10+r5*8]
+ mova m2, [cq+128*22+r5*8]
+ mova m3, [cq+128*26+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 4+r5*8]
+ mova m1, [cq+128*12+r5*8]
+ mova m2, [cq+128*20+r5*8]
+ mova m3, [cq+128*28+r5*8]
+ call .rect2_mul_fast
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 8+r5*8]
+ mova m2, [cq+128*16+r5*8]
+ mova m3, [cq+128*24+r5*8]
+ call .rect2_mul_fast
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 11 ; pd_1
+%else
+ mova m7, [o(pd_1)]
+%endif
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16]
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ call .shift_transpose
+ ; zero cq
+ pxor m7, m7
+ lea r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+ REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
+ sub r4, 4*128
+ cmp r4, cq
+ jg .zero_cq_loop
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
+ mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize*2+(8*32+64+8)*16]
+%elif WIN64
+ mov r8, [rsp+gprsize*0+64*16]
+%endif
+ add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16
+ cmp eobd, 36
+ jl .load_veryfast
+ cmp eobd, 136
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+128]
+ mov r7, -16
+%else
+ lea r2, [rsp+(8*32+3)*16]
+ mov dword [r2+0*gprsize], 8
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.rect2_mul_fast:
+%if ARCH_X86_64
+ REPX {pmulld x, m14}, m0, m1, m2, m3
+ REPX {paddd x, m11}, m0, m1, m2, m3
+%else
+ mova m4, [o(pd_2896)]
+ mova m5, [o(pd_2048)]
+ REPX {pmulld x, m4 }, m0, m1, m2, m3
+ REPX {paddd x, m5 }, m0, m1, m2, m3
+%endif
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ ret
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [t2+0*16+r5*8], m0
+ mova [t2+8*16+r5*8], m2
+ mova [t2+0*16+t0*8], m3
+ mova [t2+0*16+t1*8], m1
+ sub t2, 16*32
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ add rsp, (1+8*32+1*WIN64)*16
+ jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
+ 0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0, 6
+ mov [rsp+gprsize*1+(64*9+8)*16], r0
+ mov [rsp+gprsize*2+(64*9+8)*16], r1
+ mov [rsp+gprsize*3+(64*9+8)*16], r2
+ mov [rsp+gprsize*4+(64*9+8)*16], r6
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7, 0
+ mov [rsp+gprsize*1+(64*9+1)*16], r9
+ mov [rsp+gprsize*0+64*16], r0
+%if WIN64
+ mov [rsp+gprsize*2+(64*9+1)*16], r7
+ mov [rsp+gprsize*3+(64*9+1)*16], r8
+%endif
+%endif
+%undef cmp
+
+ ; remove entirely-zero iterations
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ lea t4, [rsp+7*64*16]
+.zero_loop_inner:
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0
+ sub t4, 64*16
+ cmp t4, rsp
+ jge .zero_loop_inner
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd
+%if ARCH_X86_32
+ mov cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128*31+r5*8]
+ mova m2, [cq+128*17+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 7+r5*8]
+ mova m1, [cq+128*25+r5*8]
+ mova m2, [cq+128*23+r5*8]
+ mova m3, [cq+128* 9+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 5+r5*8]
+ mova m1, [cq+128*27+r5*8]
+ mova m2, [cq+128*21+r5*8]
+ mova m3, [cq+128*11+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128*29+r5*8]
+ mova m2, [cq+128*19+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128*14+r5*8]
+ mova m2, [cq+128*18+r5*8]
+ mova m3, [cq+128*30+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+128* 6+r5*8]
+ mova m1, [cq+128*10+r5*8]
+ mova m2, [cq+128*22+r5*8]
+ mova m3, [cq+128*26+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 4+r5*8]
+ mova m1, [cq+128*12+r5*8]
+ mova m2, [cq+128*20+r5*8]
+ mova m3, [cq+128*28+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 8+r5*8]
+ mova m2, [cq+128*16+r5*8]
+ mova m3, [cq+128*24+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 10 ; pd_2
+%else
+ mova m7, [o(pd_2)]
+%endif
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+ call .shift_transpose
+ ; zero cq
+ pxor m7, m7
+%if ARCH_X86_32
+ mov cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+ lea r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+ REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
+ sub r4, 4*128
+ cmp r4, cq
+ jg .zero_cq_loop
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
+ mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize*2+(9*64+8)*16]
+%else
+ mov r0, [rsp+gprsize*0+64*16]
+%endif
+ add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16
+ cmp eobd, 151
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+
+%if ARCH_X86_64
+ lea r2, [dstq+128]
+ mov r7, -16
+%else
+ lea r2, [rsp+(64*8+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 8
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+ ; copy of pass=1 tmp-regs
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0, 6
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7, 0
+%endif
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [t4+t0*8], m0
+ mova [t4+t1*8], m1
+ mova [t4+t2*8], m2
+ mova [t4+t3*8], m3
+ sub t4, 16*64
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
+ (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1
diff --git a/third_party/dav1d/src/x86/itx_avx2.asm b/third_party/dav1d/src/x86/itx_avx2.asm
new file mode 100644
index 0000000000..a67f053a61
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_avx2.asm
@@ -0,0 +1,5542 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+; Note: The order of (at least some of) those constants matters!
+
+const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+%macro COEF_PAIR 2
+pw_%1_%2: dw %1, %2
+pw_m%2_%1: dw -%2, %1
+%endmacro
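+; For reference (mechanical expansion of the macro above): COEF_PAIR 3784, 1567
+; emits pw_3784_1567: dw 3784, 1567 and pw_m1567_3784: dw -1567, 3784, matching
+; the pw_<a>_<b> / pw_m<b>_<a> names referenced by the ITX_MUL*X_PACK macros
+; further down.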
+
+; ADST-only
+pw_3803_1321: dw 3803, 1321
+pw_m1321_2482: dw -1321, 2482
+pw_2482_3344: dw 2482, 3344
+pw_m3344_3344: dw -3344, 3344
+pw_m3803_3344: dw -3803, 3344
+pw_m3803_m6688: dw -3803, -6688
+pw_2896_m2896: dw 2896, -2896
+
+const pw_5, times 2 dw 5
+const pw_2048, times 2 dw 2048
+const pw_4096, times 2 dw 4096
+const pw_8192, times 2 dw 8192
+const pw_16384, times 2 dw 16384
+const pw_1697x16, times 2 dw 1697*16
+const pw_1697x8, times 2 dw 1697*8
+const pw_2896x8, times 2 dw 2896*8
+const pd_2048, dd 2048
+
+const pw_2896_2896, dw 2896, 2896
+const pw_m2896_2896, dw -2896, 2896
+const pw_1567_3784, dw 1567, 3784
+const pw_m3784_1567, dw -3784, 1567
+COEF_PAIR 3784, 1567
+COEF_PAIR 201, 4091
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4052, 601
+COEF_PAIR 401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 799, 4017
+COEF_PAIR 3406, 2276
+pw_m799_m4017: dw -799, -4017
+const pw_m1567_m3784, dw -1567, -3784
+pw_m3406_m2276: dw -3406, -2276
+pw_m401_m4076: dw -401, -4076
+pw_m3166_m2598: dw -3166, -2598
+pw_m1931_m3612: dw -1931, -3612
+pw_m3920_m1189: dw -3920, -1189
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017, 799
+
+%macro COEF_X8 1-*
+%rep %0
+ dw %1*8, %1*8
+ %rotate 1
+%endrep
+%endmacro
+
+pw_3703x8: COEF_X8 3703
+pw_1751x8: COEF_X8 1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8: COEF_X8 3857
+pw_3973x8: COEF_X8 3973
+pw_995x8: COEF_X8 995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8: COEF_X8 3513
+pw_3290x8: COEF_X8 3290
+pw_2440x8: COEF_X8 2440
+pw_m601x8: COEF_X8 -601
+pw_4052x8: COEF_X8 4052
+
+const idct64_mul
+COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520
+COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092
+COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842
+COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301
+
+pw_201_4091x8: dw 201*8, 4091*8
+pw_m601_4052x8: dw -601*8, 4052*8
+pw_995_3973x8: dw 995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8: dw 1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8: dw 2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
+%define o_idct64_offset idct64_mul - (o_base) - 8
+
+SECTION .text
+
+; Code size reduction trickery: Instead of using rip-relative loads with
+; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
+; single rip-relative lea and then address things relative to it with
+; 1-byte offsets, as long as data is within +-128 bytes of the base pointer.
+%define o_base deint_shuf + 128
+%define o(x) (r6 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
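+
+; Illustrative sketch (editorial, not part of the original source): a single
+;     lea r6, [o_base]
+; per function pays the 4-byte rip-relative offset up front; a later load like
+;     vpbroadcastd m4, [o(pd_2048)]
+; then assembles to [r6+disp8], i.e. with a 1-byte displacement, for any
+; constant that lies within +-128 bytes of o_base.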
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+%if %7 & 4
+ pmaddwd m%2, m%5, m%1
+ pmaddwd m%1, m%6
+%else
+%if %7 & 1
+ vpbroadcastd m%2, [o(pw_%5_%6)]
+ vpbroadcastd m%3, [o(pw_m%6_%5)]
+%else
+ vpbroadcastd m%2, [o(pw_m%6_%5)]
+ vpbroadcastd m%3, [o(pw_%5_%6)]
+%endif
+ pmaddwd m%2, m%1
+ pmaddwd m%1, m%3
+%endif
+ paddd m%2, m%4
+ paddd m%1, m%4
+%if %7 & 2
+ pslld m%2, 4
+ psrld m%1, 12
+ pblendw m%1, m%2, 0xaa
+%else
+ psrad m%2, 12
+ psrad m%1, 12
+ packssdw m%1, m%2
+%endif
+%endmacro
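+
+; Usage note (editorial reading of the macro above): flags=3 combines swap
+; and interleave, as in
+;     ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
+; flag 1 swaps which of pw_%5_%6 / pw_m%6_%5 multiplies the destination, so
+; the two rotation results trade places; flag 2 leaves the results word
+; interleaved via pslld/psrld/pblendw instead of packing with packssdw; and
+; flag 4 takes the coefficients from registers m%5/m%6 instead of memory.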
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
+%if %10 & 1
+ vpbroadcastd m%3, [o(pw_%8_%9)]
+ vpbroadcastd m%4, [o(pw_m%9_%8)]
+ vpbroadcastd xm%2, [o(pw_%6_%7)]
+ vpblendd m%2, m%3, 0xf0
+ vpbroadcastd xm%3, [o(pw_m%7_%6)]
+%else
+ vpbroadcastd m%3, [o(pw_m%9_%8)]
+ vpbroadcastd m%4, [o(pw_%8_%9)]
+ vpbroadcastd xm%2, [o(pw_m%7_%6)]
+ vpblendd m%2, m%3, 0xf0
+ vpbroadcastd xm%3, [o(pw_%6_%7)]
+%endif
+ vpblendd m%3, m%4, 0xf0
+ ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10)
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
+ punpckhwd m%3, m%2, m%1
+ punpcklwd m%2, m%1
+%if %7 < 32
+ pmaddwd m%1, m%7, m%2
+ pmaddwd m%4, m%7, m%3
+%else
+ vpbroadcastd m%1, [o(pw_m%7_%6)]
+ pmaddwd m%4, m%3, m%1
+ pmaddwd m%1, m%2
+%endif
+ paddd m%4, m%5
+ paddd m%1, m%5
+ psrad m%4, 12
+ psrad m%1, 12
+ packssdw m%1, m%4
+%if %7 < 32
+ pmaddwd m%3, m%6
+ pmaddwd m%2, m%6
+%else
+ vpbroadcastd m%4, [o(pw_%6_%7)]
+ pmaddwd m%3, m%4
+ pmaddwd m%2, m%4
+%endif
+ paddd m%3, m%5
+ paddd m%2, m%5
+ psrad m%3, 12
+ psrad m%2, 12
+%if %0 == 8
+ packssdw m%8, m%2, m%3
+%else
+ packssdw m%2, m%3
+%endif
+%endmacro
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3
+ ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0
+ psubsw m%3, m%1, m%2
+ paddsw m%2, m%1
+ paddsw m%1, m%4, m%5
+ psubsw m%4, m%5
+%endmacro
+
+%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
+ ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a
+ ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
+ paddsw m%9, m%2, m%6 ; t4
+ psubsw m%2, m%6 ; t5a
+ paddsw m%10, m%8, m%4 ; t7
+ psubsw m%8, m%4 ; t6a
+ ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
+ ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
+ psubsw m%6, m%1, m%3 ; dct4 out2
+ paddsw m%3, m%1 ; dct4 out1
+ paddsw m%1, m%5, m%7 ; dct4 out0
+ psubsw m%5, m%7 ; dct4 out3
+ psubsw m%7, m%3, m%2 ; out6
+ paddsw m%2, m%3 ; out1
+ paddsw m%3, m%6, m%8 ; out2
+ psubsw m%6, m%8 ; out5
+ psubsw m%8, m%1, m%10 ; out7
+ paddsw m%1, m%10 ; out0
+ paddsw m%4, m%5, m%9 ; out3
+ psubsw m%5, m%9 ; out4
+%endmacro
+
+; in1 = %1, in3 = %2, in5 = %3, in7 = %4
+; in9 = %5, in11 = %6, in13 = %7, in15 = %8
+%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
+ psubsw m%9, m%2, m%6 ; t13
+ paddsw m%6, m%2 ; t12
+ psubsw m%2, m%8, m%4 ; t14
+ paddsw m%8, m%4 ; t15
+ psubsw m%4, m%7, m%3 ; t10
+ paddsw m%3, m%7 ; t11
+ psubsw m%7, m%1, m%5 ; t9
+ paddsw m%1, m%5 ; t8
+ ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a
+ ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
+ psubsw m%5, m%1, m%3 ; t11a
+ paddsw m%1, m%3 ; t8a
+ psubsw m%3, m%7, m%4 ; t13
+ paddsw m%7, m%4 ; t14
+ psubsw m%4, m%8, m%6 ; t12a
+ paddsw m%8, m%6 ; t15a
+ psubsw m%6, m%2, m%9 ; t10
+ paddsw m%2, m%9 ; t9
+ ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
+ ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12
+%endmacro
+
+%macro WRAP_XMM 1+
+ INIT_XMM cpuname
+ %1
+ INIT_YMM cpuname
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+ vpbroadcastd m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+ movd m2, [%%row_adr1]
+ pinsrd m2, [%%row_adr2], 1
+ movd m3, [%%row_adr3]
+ pinsrd m3, [%%row_adr4], 1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ movd [%%row_adr1], m0
+ pextrd [%%row_adr2], m0, 1
+ pextrd [%%row_adr3], m0, 2
+ pextrd [%%row_adr4], m0, 3
+ ret
+%endmacro
+
+%macro IWHT4_1D_PACKED 0
+ punpckhqdq m3, m0, m1 ; in1 in3
+ punpcklqdq m0, m1 ; in0 in2
+ psubw m2, m0, m3
+ paddw m0, m3
+ punpckhqdq m2, m2 ; t2 t2
+ punpcklqdq m0, m0 ; t0 t0
+ psubw m1, m0, m2
+ psraw m1, 1
+ psubw m1, m3 ; t1 t3
+ psubw m0, m1 ; ____ out0
+ paddw m2, m1 ; out3 ____
+%endmacro
+
+INIT_XMM avx2
+cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+ psraw m0, 2
+ psraw m1, 2
+ IWHT4_1D_PACKED
+ punpckhwd m0, m1
+ punpcklwd m3, m1, m2
+ punpckhdq m1, m0, m3
+ punpckldq m0, m3
+ IWHT4_1D_PACKED
+ vpblendd m0, m2, 0x03
+ ITX4_END 3, 0, 2, 1, 0
+
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%3_internal_8bpc)
+ lea r6, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4
+%ifidn %1_%2, dct_dct
+ vpbroadcastw m0, [cq]
+ vpbroadcastd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [cq], eobd ; 0
+ pmulhrsw m0, m1
+ mova m1, m0
+ jmp m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0
+ vpbroadcastd m4, [o(pd_2048)]
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+%endmacro
+
+%macro IADST4_1D_PACKED 0
+ punpcklwd m2, m1, m0
+ punpckhwd m3, m1, m0
+ vpbroadcastd m5, [o(pw_m3344_3344)]
+ vpbroadcastd m0, [o(pw_3803_1321)]
+ vpbroadcastd m4, [o(pw_m1321_2482)]
+ pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2
+ psrld m5, 16
+ pmaddwd m0, m2
+ pmaddwd m2, m4
+ pmaddwd m5, m3 ; 3344*in0
+ paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
+ vpbroadcastd m4, [o(pw_2482_3344)]
+ vpbroadcastd m5, [o(pw_m3803_3344)]
+ pmaddwd m4, m3
+ pmaddwd m5, m3
+ paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
+ vpbroadcastd m0, [o(pw_m3803_m6688)]
+ pmaddwd m3, m0
+ vpbroadcastd m0, [o(pd_2048)]
+ paddd m2, m0
+ paddd m1, m0
+ paddd m0, m4
+ paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
+ paddd m2, m4
+ paddd m2, m3
+ REPX {psrad x, 12}, m1, m2, m0, m5
+ packssdw m0, m5 ; out0 out1
+ packssdw m1, m2 ; out2 out3
+%endmacro
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ IDCT4_1D_PACKED
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call .main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+.end2:
+ ITX4_END 0, 1, 2, 3
+ALIGN function_align
+cglobal_label .main
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
+%macro WRITE_4X8 2 ; coefs[1-2]
+ movd xm4, [dstq+strideq*0]
+ pinsrd xm4, [dstq+strideq*1], 1
+ movd xm5, [dstq+strideq*2]
+ pinsrd xm5, [dstq+r3 ], 1
+ pinsrd xm4, [r2 +strideq*0], 2
+ pinsrd xm4, [r2 +strideq*1], 3
+ pinsrd xm5, [r2 +strideq*2], 2
+ pinsrd xm5, [r2 +r3 ], 3
+ pmovzxbw m4, xm4
+ pmovzxbw m5, xm5
+ paddw m4, m%1
+ paddw m5, m%2
+ packuswb m4, m5
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ pextrd [dstq+strideq*2], xm4, 2
+ pextrd [dstq+r3 ], xm4, 3
+ movd [r2 +strideq*0], xm5
+ pextrd [r2 +strideq*1], xm5, 1
+ pextrd [r2 +strideq*2], xm5, 2
+ pextrd [r2 +r3 ], xm5, 3
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_2048)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ jmp m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT8_1D_PACKED 0
+ vpbroadcastd m6, [o(pd_2048)]
+ punpckhwd m5, m3, m0 ; in7 in1
+ punpckhwd m4, m1, m2 ; in3 in5
+ punpcklwd m3, m1 ; in6 in2
+ punpcklwd m2, m0 ; in4 in0
+ ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
+ psubsw m0, m5, m4 ; t5a t6a (interleaved)
+ paddsw m4, m5 ; t4 t7 (interleaved)
+ ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
+ vpbroadcastd m1, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5
+%if mmsize > 16
+ vbroadcasti128 m1, [o(deint_shuf)]
+ pshufb m4, m1
+%else
+ pshufb m4, [o(deint_shuf)]
+%endif
+ psubsw m1, m2, m3 ; tmp3 tmp2
+ paddsw m3, m2 ; tmp0 tmp1
+ shufps m2, m4, m0, q1032 ; t7 t6
+ vpblendd m4, m0, 0xcc ; t4 t5
+ paddsw m0, m3, m2 ; out0 out1
+ psubsw m3, m2 ; out7 out6
+ psubsw m2, m1, m4 ; out4 out5
+ paddsw m1, m4 ; out3 out2
+%endmacro
+
+%macro IADST8_1D_PACKED 1 ; pass
+ vpbroadcastd m6, [o(pd_2048)]
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+%if %1 == 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+ psubsw m4, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+ vbroadcasti128 m2, [o(deint_shuf)]
+%else
+ mova m2, [o(deint_shuf)]
+%endif
+ pshuflw m1, m1, q2301
+ pshufhw m1, m1, q2301
+ psubsw m3, m0, m1 ; t3 t2
+ paddsw m0, m1 ; -out7 out0
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ pshufb m0, m2
+ pshufb m4, m2
+ vpbroadcastd m5, [o(pw_m2896_2896)]
+ pmaddwd m2, m5, m3
+ pmaddwd m5, m1
+ paddd m2, m6
+ paddd m5, m6
+ psrad m2, 12
+ psrad m5, 12
+ packssdw m2, m5 ; out4 -out5
+ vpbroadcastd m5, [o(pw_2896_2896)]
+ pmaddwd m3, m5
+ pmaddwd m1, m5
+ paddd m3, m6
+ paddd m1, m6
+ psrad m3, 12
+ psrad m1, 12
+ packssdw m1, m3 ; out2 -out3
+ punpcklqdq m3, m4, m0 ; out6 -out7
+ punpckhqdq m0, m4 ; out0 -out1
+%else
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
+ psubsw m4, m0, m2 ; t4 t5
+ paddsw m0, m2 ; t0 t1
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ shufps m2, m5, m4, q1032
+ punpckhwd m4, m2
+ punpcklwd m5, m2
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a
+ psubsw m2, m0, m1 ; t2 t3
+ paddsw m0, m1 ; out0 -out7
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ vpbroadcastd m5, [o(pw_2896x8)]
+ vpblendd m3, m0, m4, 0x33 ; out6 -out7
+ vpblendd m0, m4, 0xcc ; out0 -out1
+ shufps m4, m2, m1, q1032 ; t3 t7
+ vpblendd m1, m2, 0x33 ; t2 t6
+ psubsw m2, m1, m4 ; t2-t3 t6-t7
+ paddsw m1, m4 ; t2+t3 t6+t7
+ pmulhrsw m2, m5 ; out4 -out5
+ pshufd m1, m1, q1032
+ pmulhrsw m1, m5 ; out2 -out3
+%endif
+%endmacro
+
+INIT_YMM avx2
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
+
+cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ IDCT4_1D_PACKED
+ vbroadcasti128 m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ WRAP_XMM IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call .main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
+ pxor m5, m5
+ psubw m5, m4
+.end:
+ vpblendd m4, m5, 0xcc
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ WIN64_RESTORE_XMM
+ pxor m2, m2
+ mova [cq+32*0], m2
+ mova [cq+32*1], m2
+.end3:
+ lea r2, [dstq+strideq*4]
+ lea r3, [strideq*3]
+ WRITE_4X8 0, 1
+ RET
+ALIGN function_align
+.main_pass1:
+ WRAP_XMM IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ WRAP_XMM IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m3
+ punpckhwd m1, m3
+ jmp tx2q
+.pass2:
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call m(iadst_4x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti128 m3, xm1, 1
+ vinserti128 m2, xm0, 1
+ pxor m4, m4
+ psubw m4, m5
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m2, [cq+32*0], q3120
+ vpermq m0, [cq+32*1], q3120
+ vpbroadcastd m3, [o(pw_2896x8)]
+ vpbroadcastd m4, [o(pw_1697x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ pmulhrsw m2, m4, m0
+ pmulhrsw m4, m1
+ paddsw m0, m2
+ paddsw m1, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ movd xm3, [o(pw_2048)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm2
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm3
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp m(iadst_4x16_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT16_1D_PACKED 0
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ punpckhwd m8, m7, m0 ; dct16 in15 in1
+ punpcklwd m9, m4, m0 ; dct4 in2 in0
+ punpckhwd m0, m3, m4 ; dct16 in7 in9
+ punpcklwd m7, m1 ; dct8 in7 in1
+ punpckhwd m1, m6 ; dct16 in3 in13
+ punpcklwd m3, m5 ; dct8 in3 in5
+ punpckhwd m5, m2 ; dct16 in11 in5
+ punpcklwd m6, m2 ; dct4 in3 in1
+ ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a
+ ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a
+ ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
+ ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
+ ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2
+ psubsw m2, m8, m0 ; t9 t14
+ paddsw m8, m0 ; t8 t15
+ psubsw m0, m1, m5 ; t10 t13
+ paddsw m1, m5 ; t11 t12
+ vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784
+ ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a
+ vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
+ ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a
+ psubsw m4, m8, m1 ; t11a t12a
+ paddsw m8, m1 ; t8a t15a
+ psubsw m1, m7, m3 ; t5a t6a
+ paddsw m7, m3 ; t4 t7
+ paddsw m3, m2, m0 ; t9 t14
+ psubsw m2, m0 ; t10 t13
+%if mmsize > 16
+ vbroadcasti128 m0, [o(deint_shuf)]
+%else
+ mova m0, [o(deint_shuf)]
+%endif
+ pshufb m8, m0
+ pshufb m7, m0
+ pshufb m3, m0
+ ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1
+ vpbroadcastd m0, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12
+ vpbroadcastd m5, [o(pw_2896_2896)]
+ ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5
+ vpbroadcastd m0, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a
+ punpckhqdq m0, m8, m3 ; t15a t14
+ punpcklqdq m8, m3 ; t8a t9
+ shufps m5, m4, m2, q1032 ; t12 t13a
+ vpblendd m4, m2, 0xcc ; t11 t10a
+ shufps m2, m7, m1, q1032 ; t7 t6
+ vpblendd m7, m1, 0xcc ; t4 t5
+ psubsw m1, m9, m6 ; dct4 out3 out2
+ paddsw m9, m6 ; dct4 out0 out1
+ psubsw m3, m9, m2 ; dct8 out7 out6
+ paddsw m9, m2 ; dct8 out0 out1
+ psubsw m2, m1, m7 ; dct8 out4 out5
+ paddsw m1, m7 ; dct8 out3 out2
+ psubsw m7, m9, m0 ; out15 out14
+ paddsw m0, m9 ; out0 out1
+ psubsw m6, m1, m5 ; out12 out13
+ paddsw m1, m5 ; out3 out2
+ psubsw m5, m2, m4 ; out11 out10
+ paddsw m2, m4 ; out4 out5
+ psubsw m4, m3, m8 ; out8 out9
+ paddsw m3, m8 ; out7 out6
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
+
+cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ call m(idct_16x4_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ REPX {pmulhrsw x, m5}, m0, m4, m2, m3
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vextracti128 xm4, m0, 1
+ vextracti128 xm5, m1, 1
+ vextracti128 xm6, m2, 1
+ vextracti128 xm7, m3, 1
+ call .main
+ vinserti128 m0, xm4, 1
+ vinserti128 m1, xm5, 1
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti128 m2, xm6, 1
+ vinserti128 m3, xm7, 1
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ jmp m(iadst_4x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ WRAP_XMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ REPX {pmulhrsw x, m5}, m4, m2, m3, m0
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
+ vpbroadcastd m5, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ vpblendd m4, m1, m0, 0x33
+ vpblendd m0, m2, 0x33
+ vpblendd m2, m3, 0x33
+ vpblendd m3, m1, 0x33
+ vpermq m0, m0, q2031
+ vpermq m1, m2, q1302
+ vpermq m2, m3, q3120
+ vpermq m3, m4, q0213
+ psubw m6, m7, m5
+.end:
+ vpblendd m5, m6, 0xcc
+.end2:
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ WIN64_RESTORE_XMM
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+.end3:
+ lea r2, [dstq+strideq*8]
+ lea r3, [strideq*3]
+ WRITE_4X8 0, 1
+ lea dstq, [dstq+strideq*4]
+ lea r2, [r2 +strideq*4]
+ WRITE_4X8 2, 3
+ RET
+ALIGN function_align
+.main:
+ vpblendd m4, m1, m0, 0xcc
+ vpblendd m1, m0, 0x33
+ vpblendd m5, m2, m3, 0xcc
+ vpblendd m2, m3, 0x33
+ vperm2i128 m3, m5, m2, 0x31
+ vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1
+ vperm2i128 m4, m1, m4, 0x31
+ vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5
+ pshufd m3, m3, q1032 ; in15 in12 in13 in14
+ pshufd m2, m4, q1032 ; in11 in8 in9 in10
+cglobal_label .main2
+ vpbroadcastd m8, [o(pd_2048)]
+ pxor m7, m7
+ punpckhwd m4, m3, m0 ; in12 in3 in14 in1
+ punpcklwd m0, m3 ; in0 in15 in2 in13
+ punpckhwd m3, m2, m1 ; in8 in7 in10 in5
+ punpcklwd m1, m2 ; in4 in11 in6 in9
+ ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3
+ ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
+ ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
+ ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3
+ psubsw m2, m0, m3 ; t9a t8a t11a t10a
+ paddsw m0, m3 ; t1a t0a t3a t2a
+ psubsw m3, m1, m4 ; t13a t12a t15a t14a
+ paddsw m1, m4 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3
+ psubw m6, m7, m5
+ ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6
+ vpbroadcastd m6, [o(pw_m3784_1567)]
+ vpbroadcastd m5, [o(pw_1567_3784)]
+ psubsw m4, m0, m1 ; t5 t4 t7 t6
+ paddsw m0, m1 ; t1 t0 t3 t2
+ psubsw m1, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ psubw m3, m7, m6 ; pw_3784_m1567
+ vpblendd m6, m3, 0xf0
+ ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
+ ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
+ vbroadcasti128 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a
+ vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a
+ vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14
+ vinserti128 m4, xm1, 1 ; t4a t5a t12 t13
+ pshufd m2, m2, q1032 ; t6a t7a t14 t15
+ psubsw m1, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m4, m2 ; -out3 out12 out2 -out13
+ psubsw m4, m2 ; t6 t7 t14a t15a
+ shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
+ vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m5, [o(pw_m2896_2896)]
+ vpbroadcastd m6, [o(pw_2896_2896)]
+ punpcklwd m1, m4, m2
+ punpckhwd m4, m2
+ pmaddwd m2, m5, m4
+ pmaddwd m4, m6
+ pmaddwd m5, m1
+ pmaddwd m1, m6
+ REPX {paddd x, m8}, m5, m1, m2, m4
+ REPX {psrad x, 12}, m5, m2, m1, m4
+ packssdw m2, m5 ; -out11 out8 out10 -out9
+ packssdw m1, m4 ; -out7 out4 out6 -out5
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpcklwd m4, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m3, m2
+ punpckhwd m3, m2
+ REPX {pmulhrsw x, m5}, m4, m1, m0, m3
+ punpckldq m2, m3, m1
+ punpckhdq m3, m1
+ punpckhdq m1, m0, m4
+ punpckldq m0, m4
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
+ vpbroadcastd m6, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ vpblendd m4, m0, m2, 0x33
+ vpblendd m0, m1, 0xcc
+ vpblendd m1, m3, 0xcc
+ vpblendd m2, m3, 0x33
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q0213
+ vpermq m2, m2, q2031
+ vpermq m3, m4, q1302
+ psubw m5, m7, m6
+ jmp m(iadst_4x16_internal_8bpc).end
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m3, [cq+32*0]
+ mova m2, [cq+32*1]
+ mova m4, [cq+32*2]
+ mova m5, [cq+32*3]
+ vpbroadcastd m8, [o(pw_1697x8)]
+ pcmpeqw m0, m0 ; -1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m5
+ punpckhwd m4, m5
+ pmulhrsw m5, m8, m1
+ pmulhrsw m6, m8, m2
+ pmulhrsw m7, m8, m3
+ pmulhrsw m8, m4
+ pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is
+ pxor m1, m9 ; unsigned. as long as both signs are equal
+ pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the
+ pxor m2, m9 ; pmulhrsw result will become 0 which causes
+ pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
+ pxor m3, m9 ; we explicitly deal with that case here.
+ pcmpeqw m0, m4
+ pxor m4, m0
+ pavgw m1, m5
+ pavgw m2, m6
+ pavgw m3, m7
+ pavgw m4, m8
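+ ; Worked corner case (illustration, not original source): for an input
+ ; word of -1, pmulhrsw(-1, 1697*8) rounds to 0; unsigned pavgw of
+ ; 0xffff and 0x0000 would give 0x8000 (-32768), while the desired
+ ; signed average is (-1+0+1)>>1 = 0. Flipping -1 to 0 beforehand via
+ ; pcmpeqw+pxor makes pavgw return the correct 0.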
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ vpbroadcastd m5, [o(pw_2048)]
+ pmulhrsw m4, m8, m0
+ pmulhrsw m6, m8, m1
+ pmulhrsw m7, m8, m2
+ pmulhrsw m8, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m6
+ paddsw m2, m7
+ paddsw m3, m8
+ jmp m(iadst_4x16_internal_8bpc).end2
+
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
+ movq xm%3, [dstq ]
+ movhps xm%3, [dstq+%5]
+ movq xm%4, [dstq+%6]
+ movhps xm%4, [dstq+%7]
+ pmovzxbw m%3, xm%3
+ pmovzxbw m%4, xm%4
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vextracti128 xm%4, m%3, 1
+ movq [dstq ], xm%3
+ movhps [dstq+%6], xm%3
+ movq [dstq+%5], xm%4
+ movhps [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpbroadcastd xm3, [o(pw_2896x8)]
+ pmulhrsw xm0, xm3, [cq+16*0]
+ pmulhrsw xm1, xm3, [cq+16*1]
+ pmulhrsw xm2, xm3, [cq+16*2]
+ pmulhrsw xm3, [cq+16*3]
+ call m(idct_4x8_internal_8bpc).main
+ vbroadcasti128 m4, [o(deint_shuf)]
+ vinserti128 m3, m1, xm3, 1
+ vinserti128 m1, m0, xm2, 1
+ shufps m0, m1, m3, q0220
+ shufps m1, m3, q1331
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pxor m3, m3
+ psubsw m3, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+.end2:
+ vpbroadcastd m2, [o(pw_2048)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ WIN64_RESTORE_XMM
+.end3:
+ pxor m2, m2
+ mova [cq+32*0], m2
+ mova [cq+32*1], m2
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ RET
+ALIGN function_align
+cglobal_label .main
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti128 m3, xm1, 1
+ vinserti128 m2, xm0, 1
+ punpckhwd m1, m3, m2
+ punpcklwd m3, m2
+ pxor m0, m0
+ psubsw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m2, m1
+ vpermq m1, m0, q2031
+ vpermq m0, m2, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm0, [cq+16*1]
+ vinserti128 m2, [cq+16*2], 1
+ vinserti128 m0, [cq+16*3], 1
+ vpbroadcastd m3, [o(pw_2896x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ paddsw m0, m0
+ paddsw m1, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ or r3d, 8 ; r3d (eobd) is 0 here, so this just sets the row count
+.dconly:
+ pmulhrsw xm0, xm2
+.dconly2:
+ movd xm2, [pw_2048] ; intentionally rip-relative
+ pmulhrsw xm0, xm1
+ lea r2, [strideq*3]
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
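+ ; add the broadcast DC value to the block, 4 rows per iteration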
+.dconly_loop:
+ WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2
+ lea dstq, [dstq+strideq*4]
+ sub r3d, 4
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ call .main
+ shufps m4, m0, m1, q0220
+ shufps m5, m0, m1, q1331
+ shufps m1, m2, m3, q0220
+ shufps m3, m2, m3, q1331
+ vbroadcasti128 m0, [o(deint_shuf)]
+ vpbroadcastd m2, [o(pw_16384)]
+ REPX {pshufb x, m0}, m4, m5, m1, m3
+ REPX {pmulhrsw x, m2}, m4, m5, m1, m3
+ vinserti128 m0, m4, xm1, 1
+ vperm2i128 m2, m4, m1, 0x31
+ vinserti128 m1, m5, xm3, 1
+ vperm2i128 m3, m5, m3, 0x31
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ jmp m(iadst_8x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call .main_pass1
+ vpbroadcastd m5, [o(pw_16384)]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ pxor m3, m3
+ psubw m3, m5 ; negate odd elements during rounding
+ pmulhrsw m4, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m5
+ pmulhrsw m2, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ vperm2i128 m2, m3, m0, 0x31
+ vinserti128 m0, m3, xm0, 1
+ vperm2i128 m3, m4, m1, 0x31
+ vinserti128 m1, m4, xm1, 1
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call .main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vpbroadcastd xm4, [o(pw_4096)]
+ psubw m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ WIN64_RESTORE_XMM
+.end4:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 5
+ RET
+ALIGN function_align
+.main_pass1:
+ IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call m(iadst_8x8_internal_8bpc).main_pass1
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ pxor m0, m0
+ psubw m0, m5
+ pmulhrsw m4, m0
+ pmulhrsw m3, m5
+ pmulhrsw m2, m0
+ pmulhrsw m1, m5
+ punpckhwd m0, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m3, m2, m1
+ punpcklwd m2, m1
+ vinserti128 m1, m0, xm3, 1
+ vperm2i128 m3, m0, m3, 0x31
+ vinserti128 m0, m4, xm2, 1
+ vperm2i128 m2, m4, m2, 0x31
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vpbroadcastd xm5, [o(pw_4096)]
+ psubw m4, m5 ; lower half = -2048, upper half = 2048
+ vpermq m5, m3, q2031
+ vpermq m3, m0, q2031
+ vpermq m0, m2, q2031
+ vpermq m2, m1, q2031
+ pmulhrsw m1, m0, m4
+ pmulhrsw m0, m5, m4
+ jmp m(iadst_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*1]
+ vinserti128 m3, [cq+16*4], 1
+ vinserti128 m2, [cq+16*5], 1
+ mova xm4, [cq+16*2]
+ mova xm0, [cq+16*3]
+ vinserti128 m4, [cq+16*6], 1
+ vinserti128 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_8X16_LOAD_COEFS 0
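+ ; Load all 8x16 coefficients, pre-multiplied by 2896/4096 (1/sqrt(2)),
+ ; the rect2 scale required for non-square transforms.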
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m0, m4, [cq+32*0]
+ add cq, 32*4
+ pmulhrsw m7, m4, [cq+32*3]
+ pmulhrsw m1, m4, [cq-32*3]
+ pmulhrsw m6, m4, [cq+32*2]
+ pmulhrsw m2, m4, [cq-32*2]
+ pmulhrsw m5, m4, [cq+32*1]
+ pmulhrsw m3, m4, [cq-32*1]
+ pmulhrsw m4, [cq+32*0]
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_8X16_LOAD_COEFS
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m10, [o(pw_16384)]
+.pass1_end:
+ vperm2i128 m9, m3, m7, 0x31
+ vinserti128 m3, xm7, 1
+ vperm2i128 m8, m2, m6, 0x31
+ vinserti128 m2, xm6, 1
+ vperm2i128 m6, m1, m5, 0x31
+ vinserti128 m1, xm5, 1
+ vperm2i128 m5, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+.pass1_end2:
+ punpckhwd m7, m5, m6
+ punpcklwd m5, m6
+ punpcklwd m6, m8, m9
+ punpckhwd m8, m9
+ REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpckldq m6, m7, m8
+ punpckhdq m7, m8
+ jmp tx2q
+.pass2:
+ call .main
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+ REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+ vpbroadcastd m8, [o(pw_2048)]
+.end2:
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+.end3:
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 8, 9
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 0, 1
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 4, 5, 0, 1
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 6, 7, 0, 1
+ RET
+ALIGN function_align
+cglobal_label .main
+ IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_8X16_LOAD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+ vpbroadcastd m10, [o(pw_16384)]
+ pslld m9, m10, 17
+ psubw m10, m9 ; 16384, -16384
+ jmp m(idct_8x16_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+ call .main
+ call .main_pass2_end
+ vpbroadcastd m9, [o(pw_2048)]
+ vpbroadcastd xm8, [o(pw_4096)]
+ psubw m8, m9 ; lower half = 2048, upper half = -2048
+ REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+ REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+ jmp m(idct_8x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ REPX {pshufd x, x, q1032}, m7, m1, m5, m3
+.main2:
+ vpbroadcastd m10, [o(pd_2048)]
+ punpckhwd m8, m7, m0 ; in14 in1
+ punpcklwd m0, m7 ; in0 in15
+ punpcklwd m7, m6, m1 ; in12 in3
+ punpckhwd m1, m6 ; in2 in13
+ punpckhwd m6, m5, m2 ; in10 in5
+ punpcklwd m2, m5 ; in4 in11
+ punpcklwd m5, m4, m3 ; in8 in7
+ punpckhwd m3, m4 ; in6 in9
+ ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3
+ ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5
+ ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7
+ ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9
+ ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
+ ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
+ ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15
+ psubsw m4, m0, m5 ; t9a t8a
+ paddsw m0, m5 ; t1a t0a
+ psubsw m5, m1, m6 ; t11a t10a
+ paddsw m1, m6 ; t3a t2a
+ psubsw m6, m2, m7 ; t13a t12a
+ paddsw m2, m7 ; t5a t4a
+ psubsw m7, m3, m8 ; t15a t14a
+ paddsw m3, m8 ; t7a t6a
+ vpbroadcastd m11, [o(pw_m4017_799)]
+ vpbroadcastd m12, [o(pw_799_4017)]
+ pxor m9, m9
+ ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9
+ psubw m8, m9, m11 ; pw_4017_m799
+ ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
+ vpbroadcastd m11, [o(pw_m2276_3406)]
+ vpbroadcastd m12, [o(pw_3406_2276)]
+ ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
+ psubw m8, m9, m11 ; pw_2276_m3406
+ ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
+ psubsw m8, m1, m3 ; t7 t6
+ paddsw m1, m3 ; t3 t2
+ psubsw m3, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m2, m5, m7 ; t14a t15a
+ paddsw m7, m5 ; t10a t11a
+ psubsw m5, m4, m6 ; t12a t13a
+ paddsw m4, m6 ; t8a t9a
+ vpbroadcastd m11, [o(pw_m3784_1567)]
+ vpbroadcastd m12, [o(pw_1567_3784)]
+ ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
+ psubw m6, m9, m11 ; pw_3784_m1567
+ ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a
+ vpbroadcastd m11, [o(pw_m1567_3784)]
+ vpbroadcastd m12, [o(pw_3784_1567)]
+ ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
+ psubw m6, m9, m11 ; pw_1567_m3784
+ ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
+ vbroadcasti128 m12, [o(deint_shuf)]
+ paddsw m6, m4, m7 ; -out1 out14
+ psubsw m4, m7 ; t10 t11
+ psubsw m11, m3, m8 ; t7 t6
+ paddsw m8, m3 ; out12 -out3
+ psubsw m3, m0, m1 ; t3a t2a
+ paddsw m0, m1 ; -out15 out0
+ paddsw m1, m2, m5 ; -out13 out2
+ psubsw m5, m2 ; t15a t14a
+ pshufb m0, m12
+ pshufb m6, m12
+ pshufb m8, m12
+ pshufb m1, m12
+ shufps m7, m6, m0, q1032 ; out14 -out15
+ vpblendd m0, m6, 0x33 ; -out1 out0
+ punpcklqdq m6, m8, m1 ; out12 -out13
+ punpckhqdq m1, m8, m1 ; -out3 out2
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ pmaddwd m9, m8, m11 ; -out11
+ pmaddwd m2, m12, m5 ; -out5
+ pmaddwd m5, m8 ; out10
+ pmaddwd m11, m12 ; out4
+ REPX {paddd x, m10}, m9, m5, m2, m11
+ REPX {psrad x, 12 }, m9, m5, m2, m11
+ packssdw m5, m9 ; out10 -out11
+ packssdw m2, m11 ; -out5 out4
+ pmaddwd m11, m8, m3 ; out8
+ vpbroadcastd m8, [o(pw_2896_m2896)]
+ pmaddwd m3, m12 ; -out7
+ pmaddwd m8, m4 ; -out9
+ pmaddwd m4, m12 ; out6
+ REPX {paddd x, m10}, m11, m3, m8, m4
+ REPX {psrad x, 12 }, m11, m3, m8, m4
+ packssdw m3, m4 ; -out7 out6
+ packssdw m4, m11, m8 ; out8 -out9
+ vpbroadcastd m10, [o(pw_16384)]
+ pxor m9, m9
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ vpbroadcastd m8, [o(pw_2896x8)]
+ pshufb m2, m11, m12
+ pshufb m5, m12
+ pshufb m3, m12
+ pshufb m4, m12
+ punpcklqdq m11, m5, m2 ; t15a t7
+ punpckhqdq m5, m2 ; t14a t6
+ shufps m2, m3, m4, q1032 ; t2a t10
+ vpblendd m3, m4, 0xcc ; t3a t11
+ psubsw m4, m2, m3 ; out8 -out9
+ paddsw m3, m2 ; -out7 out6
+ paddsw m2, m5, m11 ; -out5 out4
+ psubsw m5, m11 ; out10 -out11
+ REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_8X16_LOAD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+ vpbroadcastd m9, [o(pw_16384)]
+ pslld m10, m9, 17
+ psubw m10, m9 ; -16384, 16384
+ vperm2i128 m9, m4, m0, 0x31
+ vinserti128 m0, m4, xm0, 1
+ vperm2i128 m8, m5, m1, 0x31
+ vinserti128 m4, m5, xm1, 1
+ vperm2i128 m5, m7, m3, 0x31
+ vinserti128 m3, m7, xm3, 1
+ vinserti128 m1, m6, xm2, 1
+ vperm2i128 m6, m6, m2, 0x31
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m3, m1
+ punpckhwd m3, m1
+ jmp m(idct_8x16_internal_8bpc).pass1_end2
+.pass2:
+ call m(iadst_8x16_internal_8bpc).main
+ call m(iadst_8x16_internal_8bpc).main_pass2_end
+ vpbroadcastd m8, [o(pw_2048)]
+ vpbroadcastd xm9, [o(pw_4096)]
+ psubw m8, m9 ; lower half = -2048, upper half = 2048
+ vpermq m9, m0, q3120
+ vpermq m0, m7, q2031
+ vpermq m7, m1, q3120
+ vpermq m1, m6, q2031
+ vpermq m6, m2, q3120
+ vpermq m2, m5, q2031
+ vpermq m5, m3, q3120
+ vpermq m3, m4, q2031
+ pmulhrsw m0, m8
+ pmulhrsw m1, m8
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ pmulhrsw m4, m5, m8
+ pmulhrsw m5, m6, m8
+ pmulhrsw m6, m7, m8
+ pmulhrsw m7, m9, m8
+ jmp m(idct_8x16_internal_8bpc).end3
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
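+ ; out = in*2 + in*1697/2048 ~= in*2*sqrt(2) (the identity16 scale);
+ ; the result is halved when the optional 4th argument (pw_16384)
+ ; requests the downshift by 1.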
+ pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+ pmulhrsw m%2, m%4
+%else
+ paddsw m%1, m%1
+%endif
+ paddsw m%1, m%2
+%endmacro
+
+cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*2]
+ add cq, 16*8
+ vinserti128 m3, [cq+16*0], 1
+ vinserti128 m2, [cq+16*2], 1
+ vpbroadcastd m9, [o(pw_2896x8)]
+ mova xm4, [cq-16*4]
+ mova xm5, [cq-16*2]
+ vinserti128 m4, [cq+16*4], 1
+ vinserti128 m5, [cq+16*6], 1
+ mova xm7, [cq-16*7]
+ mova xm6, [cq-16*5]
+ vinserti128 m7, [cq+16*1], 1
+ vinserti128 m6, [cq+16*3], 1
+ mova xm8, [cq-16*3]
+ mova xm0, [cq-16*1]
+ vinserti128 m8, [cq+16*5], 1
+ vinserti128 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m5
+ punpckhwd m4, m5
+ punpcklwd m5, m7, m6
+ punpckhwd m7, m6
+ punpcklwd m6, m8, m0
+ punpckhwd m8, m0
+ REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpckldq m6, m7, m8
+ punpckhdq m7, m8
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(idct_8x16_internal_8bpc).end
+
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
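+ ; Add two rows of 16-bit residuals (coefs[1-2], register or memory) to
+ ; two 16-pixel rows of dst and store the clipped result.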
+ pmovzxbw m%3, [dstq+%5]
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+ pmovzxbw m%4, [dstq+%6]
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vpermq m%3, m%3, q3120
+ mova [dstq+%5], xm%3
+ vextracti128 [dstq+%6], m%3, 1
+%endmacro
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ or r3d, 4
+.dconly:
+ pmulhrsw xm0, xm2
+ movd xm2, [pw_2048] ; intentionally rip-relative
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ pxor m3, m3
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ punpckhbw m2, m1, m3
+ punpcklbw m1, m3
+ paddw m2, m0
+ paddw m1, m0
+ packuswb m1, m2
+ mova [dstq+strideq*0], xm1
+ vextracti128 [dstq+strideq*1], m1, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+ mova xm4, [cq+16*4]
+ mova xm5, [cq+16*5]
+ mova xm6, [cq+16*6]
+ mova xm7, [cq+16*7]
+ call m(idct_4x16_internal_8bpc).main
+ vinserti128 m6, m2, xm6, 1
+ vinserti128 m2, m0, xm4, 1
+ vinserti128 m0, m1, xm5, 1
+ vinserti128 m1, m3, xm7, 1
+ punpcklwd m3, m2, m6
+ punpckhwd m2, m6
+ vpbroadcastd m6, [o(pw_16384)]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ mova m1, m6
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+.pass2:
+ call .main
+ jmp m(iadst_16x4_internal_8bpc).end
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m6, [o(pd_2048)]
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6
+ ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q1230
+ vpermq m3, [cq+32*3], q2103
+ vpermq m1, [cq+32*1], q1230
+ vpermq m2, [cq+32*2], q2103
+ call m(iadst_4x16_internal_8bpc).main2
+ call m(iadst_4x16_internal_8bpc).main_pass1_end
+ punpcklwd m4, m3, m1
+ punpcklwd m5, m2, m0
+ punpckhwd m0, m1
+ punpckhwd m2, m3
+ vpbroadcastd m1, [o(pw_16384)]
+ vinserti128 m3, m0, xm2, 1
+ vperm2i128 m2, m0, m2, 0x31
+ vinserti128 m0, m4, xm5, 1
+ vperm2i128 m4, m4, m5, 0x31
+ psubw m6, m7, m1
+.pass1_end:
+ pmulhrsw m3, m1
+ pmulhrsw m2, m6
+ pmulhrsw m4, m1
+ pmulhrsw m0, m6
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpbroadcastd m4, [o(pw_2048)]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ WIN64_RESTORE_XMM
+.end2:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+.end3:
+ WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1
+ lea dstq, [dstq+strideq*2]
+ WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m6, [o(pw_m3344_3344)]
+ vpbroadcastd m7, [o(pw_3803_1321)]
+ vpbroadcastd m8, [o(pw_m1321_2482)]
+ vpbroadcastd m9, [o(pw_2482_3344)]
+ punpcklwd m4, m2, m0 ; in2 in0 l
+ punpckhwd m2, m0 ; in2 in0 h
+ psrld m5, m6, 16
+ pmaddwd m10, m6, m4 ; t2:02 l
+ pmaddwd m6, m2 ; t2:02 h
+ pmaddwd m0, m7, m4 ; t0:02 l
+ pmaddwd m7, m2 ; t0:02 h
+ pmaddwd m4, m8 ; t1:02 l
+ pmaddwd m8, m2 ; t1:02 h
+ punpckhwd m2, m3, m1 ; in3 in1 h
+ punpcklwd m3, m1 ; in3 in1 l
+ pmaddwd m1, m5, m2 ; t2:3 h
+ pmaddwd m5, m3 ; t2:3 l
+ paddd m6, m1
+ vpbroadcastd m1, [o(pd_2048)]
+ paddd m10, m5
+ pmaddwd m5, m9, m3
+ pmaddwd m9, m2
+ paddd m0, m1
+ paddd m7, m1
+ paddd m0, m5 ; t0 + t3 + 2048 l
+ paddd m7, m9 ; t0 + t3 + 2048 h
+ vpbroadcastd m9, [o(pw_m3803_3344)]
+ pmaddwd m5, m9, m2
+ pmaddwd m9, m3
+ paddd m10, m1 ; t2 + 2048 l
+ paddd m6, m1 ; t2 + 2048 h
+ paddd m5, m1 ; t1:13 + 2048 h
+ paddd m1, m9 ; t1:13 + 2048 l
+ vpbroadcastd m9, [o(pw_m3803_m6688)]
+ pmaddwd m2, m9
+ pmaddwd m3, m9
+ paddd m5, m8 ; t1 + t3 + 2048 h
+ paddd m1, m4 ; t1 + t3 + 2048 l
+ paddd m8, m7
+ paddd m4, m0
+ paddd m2, m8 ; t0 + t1 - t3 + 2048 h
+ paddd m3, m4 ; t0 + t1 - t3 + 2048 l
+ REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
+ packssdw m0, m7
+ packssdw m1, m5
+ packssdw m3, m2
+ packssdw m2, m10, m6
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q1230
+ vpermq m3, [cq+32*3], q2103
+ vpermq m1, [cq+32*1], q1230
+ vpermq m2, [cq+32*2], q2103
+ call m(iadst_4x16_internal_8bpc).main2
+ call m(iadst_4x16_internal_8bpc).main_pass1_end
+ punpckhwd m4, m3, m2
+ punpckhwd m5, m1, m0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ vpbroadcastd m6, [o(pw_16384)]
+ vinserti128 m3, m0, xm1, 1
+ vperm2i128 m2, m0, m1, 0x31
+ vinserti128 m0, m4, xm5, 1
+ vperm2i128 m4, m4, m5, 0x31
+ psubw m1, m7, m6
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m4, [o(pw_2048)]
+ REPX {pmulhrsw x, m4}, m3, m2, m1, m0
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1
+ lea dstq, [dstq+strideq*2]
+ WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1
+ RET
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm4, [cq+16*1]
+ vinserti128 m2, [cq+16*4], 1
+ vinserti128 m4, [cq+16*5], 1
+ mova xm0, [cq+16*2]
+ mova xm1, [cq+16*3]
+ vinserti128 m0, [cq+16*6], 1
+ vinserti128 m1, [cq+16*7], 1
+ vpbroadcastd m7, [o(pw_1697x16)]
+ vpbroadcastd m8, [o(pw_16384)]
+ punpcklwd m3, m2, m4
+ punpckhwd m2, m4
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ pmulhrsw m0, m7, m1
+ pmulhrsw m5, m7, m2
+ pmulhrsw m6, m7, m3
+ pmulhrsw m7, m4
+ REPX {pmulhrsw x, m8}, m0, m5, m6, m7
+ paddsw m1, m0
+ paddsw m2, m5
+ paddsw m3, m6
+ paddsw m4, m7
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_16x4_internal_8bpc).end
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 8
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m0, [cq+32*0], q3120
+ add cq, 32*4
+ vpermq m7, [cq+32*3], q%1
+ vpermq m1, [cq-32*3], q%1
+ vpermq m6, [cq+32*2], q3120
+ vpermq m2, [cq-32*2], q3120
+ vpermq m5, [cq+32*1], q%1
+ vpermq m3, [cq-32*1], q%1
+ vpermq m4, [cq+32*0], q3120
+ REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_16X8_LOAD_COEFS 3120
+ call m(idct_8x16_internal_8bpc).main
+ vpbroadcastd m10, [o(pw_16384)]
+ punpckhwd m8, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m1, m3
+ punpcklwd m1, m3
+ punpcklwd m9, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ REPX {pmulhrsw x, m10}, m8, m1, m4, m6
+.pass1_end:
+ REPX {pmulhrsw x, m10}, m0, m2, m9, m5
+ punpckhwd m3, m0, m8
+ punpcklwd m0, m8
+ punpckhwd m8, m2, m1
+ punpcklwd m2, m1
+ punpcklwd m7, m9, m4
+ punpckhwd m9, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m8
+ punpckhdq m3, m8
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m8, m9, m5
+ punpckhdq m9, m5
+ vperm2i128 m4, m0, m6, 0x31
+ vinserti128 m0, xm6, 1
+ vperm2i128 m5, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m6, m2, m8, 0x31
+ vinserti128 m2, xm8, 1
+ vperm2i128 m7, m3, m9, 0x31
+ vinserti128 m3, xm9, 1
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m8, [o(pw_2048)]
+.end:
+ REPX {pmulhrsw x, m8}, m0, m2, m4, m6
+.end2:
+ REPX {pmulhrsw x, m8}, m1, m3, m5, m7
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+.end3:
+ pxor m0, m0
+ REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+.end4:
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_16X8_LOAD_COEFS 1302
+ call m(iadst_8x16_internal_8bpc).main2
+ call m(iadst_8x16_internal_8bpc).main_pass1_end
+ psubw m11, m9, m10
+ punpcklwd m8, m0, m2
+ punpckhwd m0, m2
+ punpckhwd m2, m1, m3
+ punpcklwd m1, m3
+ punpcklwd m9, m4, m6
+ punpckhwd m4, m6
+ punpckhwd m6, m5, m7
+ punpcklwd m5, m7
+ REPX {pmulhrsw x, m11}, m8, m1, m4, m6
+ jmp m(idct_16x8_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+ call .main
+ call .main_pass2_end
+ pxor m8, m8
+ psubw m8, m9
+ REPX {pmulhrsw x, m9}, m0, m2, m4, m6
+ jmp m(idct_16x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m10, [o(pd_2048)]
+ ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
+ ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
+ psubsw m8, m2, m6 ; t6
+ paddsw m2, m6 ; t2
+ psubsw m6, m0, m4 ; t4
+ paddsw m0, m4 ; t0
+ psubsw m4, m5, m1 ; t7
+ paddsw m5, m1 ; t3
+ psubsw m1, m7, m3 ; t5
+ paddsw m7, m3 ; t1
+ ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
+ psubsw m9, m6, m8 ; t7
+ paddsw m6, m8 ; out6
+ psubsw m3, m7, m5 ; t3
+ paddsw m7, m5 ; -out7
+ psubsw m5, m0, m2 ; t2
+ paddsw m0, m2 ; out0
+ psubsw m2, m1, m4 ; t6
+ paddsw m1, m4 ; -out1
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ punpckhwd m4, m3, m5
+ punpcklwd m3, m5
+ pmaddwd m5, m11, m4
+ pmaddwd m4, m12
+ pmaddwd m8, m11, m3
+ pmaddwd m3, m12
+ REPX {paddd x, m10}, m5, m4, m8, m3
+ REPX {psrad x, 12 }, m5, m8, m4, m3
+ packssdw m3, m4 ; -out3
+ packssdw m4, m8, m5 ; out4
+ punpcklwd m5, m9, m2
+ punpckhwd m9, m2
+ pmaddwd m2, m12, m5
+ pmaddwd m5, m11
+ pmaddwd m12, m9
+ pmaddwd m11, m9
+ REPX {paddd x, m10}, m2, m5, m12, m11
+ REPX {psrad x, 12 }, m2, m12, m5, m11
+ packssdw m2, m12 ; out2
+ packssdw m5, m11 ; -out5
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ vpbroadcastd m8, [o(pw_2896x8)]
+ psubsw m4, m5, m3
+ paddsw m3, m5
+ psubsw m5, m2, m9
+ paddsw m2, m9
+ pmulhrsw m2, m8 ; out2
+ pmulhrsw m3, m8 ; -out3
+ pmulhrsw m4, m8 ; out4
+ pmulhrsw m5, m8 ; -out5
+ vpbroadcastd m9, [o(pw_2048)]
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_16X8_LOAD_COEFS 1302
+ call m(iadst_8x16_internal_8bpc).main2
+ call m(iadst_8x16_internal_8bpc).main_pass1_end
+ psubw m9, m10
+ punpcklwd m8, m6, m4
+ punpckhwd m6, m4
+ punpcklwd m4, m7, m5
+ punpckhwd m7, m5
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1
+ punpckhwd m1, m2, m0
+ punpcklwd m2, m0
+ REPX {pmulhrsw x, m10}, m8, m4, m5, m1
+ REPX {pmulhrsw x, m9 }, m6, m7, m3, m2
+ punpcklwd m0, m7, m4
+ punpckhwd m7, m4
+ punpckhwd m4, m6, m8
+ punpcklwd m6, m8
+ punpckhwd m8, m3, m5
+ punpcklwd m3, m5
+ punpcklwd m5, m2, m1
+ punpckhwd m2, m1
+ punpckhdq m1, m0, m6
+ punpckldq m0, m6
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckhdq m4, m3, m5
+ punpckldq m3, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ vinserti128 m2, m6, xm5, 1
+ vperm2i128 m6, m5, 0x31
+ vperm2i128 m5, m1, m4, 0x31
+ vinserti128 m1, xm4, 1
+ vperm2i128 m4, m0, m3, 0x31
+ vinserti128 m0, xm3, 1
+ vinserti128 m3, m7, xm8, 1
+ vperm2i128 m7, m8, 0x31
+ jmp tx2q
+.pass2:
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ pxor m8, m8
+ psubw m8, m9
+ pmulhrsw m10, m7, m8
+ pmulhrsw m7, m0, m9
+ pmulhrsw m0, m6, m9
+ pmulhrsw m6, m1, m8
+ pmulhrsw m1, m5, m8
+ pmulhrsw m5, m2, m9
+ pmulhrsw m2, m4, m9
+ pmulhrsw m4, m3, m8
+ lea r3, [strideq*3]
+ WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1
+ WRITE_16X2 1, 2, 0, 1, strideq*2, r3
+ jmp m(idct_16x8_internal_8bpc).end3
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ mova xm7, [cq+16*0]
+ mova xm2, [cq+16*1]
+ add cq, 16*8
+ vpbroadcastd m3, [o(pw_2896x8)]
+ vinserti128 m7, [cq+16*0], 1
+ vinserti128 m2, [cq+16*1], 1
+ mova xm6, [cq-16*6]
+ mova xm4, [cq-16*5]
+ vinserti128 m6, [cq+16*2], 1
+ vinserti128 m4, [cq+16*3], 1
+ mova xm8, [cq-16*4]
+ mova xm5, [cq-16*3]
+ vinserti128 m8, [cq+16*4], 1
+ vinserti128 m5, [cq+16*5], 1
+ mova xm0, [cq-16*2]
+ mova xm1, [cq-16*1]
+ vinserti128 m0, [cq+16*6], 1
+ vinserti128 m1, [cq+16*7], 1
+ vpbroadcastd m10, [o(pw_1697x16)]
+ vpbroadcastd m11, [o(pw_16384)]
+ REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
+ punpcklwd m3, m7, m2
+ punpckhwd m7, m2
+ punpcklwd m2, m6, m4
+ punpckhwd m6, m4
+ punpcklwd m4, m8, m5
+ punpckhwd m8, m5
+ punpcklwd m5, m0, m1
+ punpckhwd m0, m1
+ punpckldq m1, m3, m2
+ punpckhdq m3, m2
+ punpckldq m2, m4, m5
+ punpckhdq m4, m5
+ punpckldq m5, m7, m6
+ punpckhdq m7, m6
+ punpckldq m6, m8, m0
+ punpckhdq m8, m0
+ REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m6
+ punpckhqdq m5, m6
+ punpcklqdq m6, m7, m8
+ punpckhqdq m7, m8
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [o(pw_4096)]
+ jmp m(idct_16x8_internal_8bpc).end
+
+%define o_base pw_5 + 128
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_16X16_LOAD_COEFS 0
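+ ; Load all 16x16 coefficients; in15 is additionally spilled to the stack
+ ; since the main transform reuses m15 as a temporary.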
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ add cq, 32*8
+ mova m4, [cq-32*4]
+ mova m5, [cq-32*3]
+ mova m6, [cq-32*2]
+ mova m7, [cq-32*1]
+ mova m8, [cq+32*0]
+ mova m9, [cq+32*1]
+ mova m10, [cq+32*2]
+ mova m11, [cq+32*3]
+ mova m12, [cq+32*4]
+ mova m13, [cq+32*5]
+ mova m14, [cq+32*6]
+ mova m15, [cq+32*7]
+ mova [rsp], m15
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ ITX_16X16_LOAD_COEFS
+ call .main
+.pass1_end:
+ vpbroadcastd m1, [o(pw_8192)]
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ vextracti128 [rsp+16*5], m8, 1
+ mova [rsp+16*1], xm8
+.pass1_end2:
+ vextracti128 [rsp+16*4], m0, 1
+ mova [rsp+16*0], xm0
+ REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+ pmulhrsw m1, [rsp+32*1]
+ vperm2i128 m8, m1, m9, 0x31
+ vinserti128 m1, xm9, 1
+ vperm2i128 m9, m2, m10, 0x31
+ vinserti128 m2, xm10, 1
+ vperm2i128 m10, m3, m11, 0x31
+ vinserti128 m3, xm11, 1
+ vperm2i128 m11, m4, m12, 0x31
+ vinserti128 m4, xm12, 1
+ vperm2i128 m12, m5, m13, 0x31
+ vinserti128 m5, xm13, 1
+ vperm2i128 m13, m6, m14, 0x31
+ vinserti128 m6, xm14, 1
+ vperm2i128 m14, m7, m15, 0x31
+ vinserti128 m7, xm15, 1
+ mova m15, [rsp+32*2]
+.pass1_end3:
+ punpcklwd m0, m9, m10
+ punpckhwd m9, m10
+ punpcklwd m10, m15, m8
+ punpckhwd m15, m8
+ punpckhwd m8, m11, m12
+ punpcklwd m11, m12
+ punpckhwd m12, m13, m14
+ punpcklwd m13, m14
+ punpckhdq m14, m11, m13
+ punpckldq m11, m13
+ punpckldq m13, m15, m9
+ punpckhdq m15, m9
+ punpckldq m9, m10, m0
+ punpckhdq m10, m0
+ punpckhdq m0, m8, m12
+ punpckldq m8, m12
+ punpcklqdq m12, m13, m8
+ punpckhqdq m13, m8
+ punpcklqdq m8, m9, m11
+ punpckhqdq m9, m11
+ punpckhqdq m11, m10, m14
+ punpcklqdq m10, m14
+ punpcklqdq m14, m15, m0
+ punpckhqdq m15, m0
+ mova m0, [rsp]
+ mova [rsp], m15
+ punpckhwd m15, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m15, m1
+ punpckhdq m15, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m15
+ punpcklqdq m6, m15
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpbroadcastd m1, [o(pw_2048)]
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ mova [rsp], m6
+.end2:
+ REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+ pmulhrsw m1, [rsp+32*1]
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3
+.end3:
+ pxor m2, m2
+ REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 0, 1, strideq*2, r3
+ REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 0, 1, strideq*2, r3
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m15, [o(pd_2048)]
+ mova [rsp+gprsize+32*1], m1
+ mova [rsp+gprsize+32*2], m9
+ IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15
+ mova m1, [rsp+gprsize+32*2] ; in9
+ mova [rsp+gprsize+32*2], m14 ; tmp7
+ mova m9, [rsp+gprsize+32*1] ; in1
+ mova [rsp+gprsize+32*1], m10 ; tmp5
+ mova m14, [rsp+gprsize+32*0] ; in15
+ mova [rsp+gprsize+32*0], m6 ; tmp3
+ IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15
+ mova m6, [rsp+gprsize+32*1] ; tmp5
+ psubsw m15, m0, m14 ; out15
+ paddsw m0, m14 ; out0
+ psubsw m14, m2, m13 ; out14
+ paddsw m2, m13 ; out1
+ mova [rsp+gprsize+32*1], m2
+ psubsw m13, m4, m11 ; out13
+ paddsw m2, m4, m11 ; out2
+ psubsw m11, m8, m7 ; out11
+ paddsw m4, m8, m7 ; out4
+ mova m7, [rsp+gprsize+32*2] ; tmp7
+ psubsw m10, m6, m5 ; out10
+ paddsw m5, m6 ; out5
+ psubsw m8, m7, m9 ; out8
+ paddsw m7, m9 ; out7
+ psubsw m9, m12, m3 ; out9
+ paddsw m6, m12, m3 ; out6
+ mova m3, [rsp+gprsize+32*0] ; tmp3
+ psubsw m12, m3, m1 ; out12
+ paddsw m3, m1 ; out3
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ ITX_16X16_LOAD_COEFS
+ call .main
+ call .main_pass1_end
+ pmulhrsw m0, m1, [cq+32*0]
+ pmulhrsw m2, m1, [cq+32*1]
+ REPX {pmulhrsw x, m1}, m4, m6, m8, m10
+ pmulhrsw m12, m1, [cq+32*2]
+ pmulhrsw m14, m1, [cq+32*3]
+ vextracti128 [rsp+16*5], m8, 1
+ mova [rsp+16*1], xm8
+ pxor m8, m8
+ psubw m1, m8, m1
+ jmp m(idct_16x16_internal_8bpc).pass1_end2
+ALIGN function_align
+.pass2:
+ call .main
+ call .main_pass2_end
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ mova [rsp+32*0], m6
+ pxor m6, m6
+ psubw m1, m6, m1
+ jmp m(idct_16x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m15, [o(pd_2048)]
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*2], m4
+ ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2
+ ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6
+ ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10
+ ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14
+ psubsw m0, m2, m10 ; t10a
+ paddsw m2, m10 ; t2a
+ psubsw m10, m13, m5 ; t11a
+ paddsw m13, m5 ; t3a
+ psubsw m5, m6, m14 ; t14a
+ paddsw m6, m14 ; t6a
+ psubsw m14, m9, m1 ; t15a
+ paddsw m9, m1 ; t7a
+ ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10
+ ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15
+ psubsw m1, m10, m14 ; t14a
+ paddsw m10, m14 ; t10a
+ psubsw m14, m0, m5 ; t15a
+ paddsw m0, m5 ; t11a
+ psubsw m5, m2, m6 ; t6
+ paddsw m2, m6 ; t2
+ psubsw m6, m13, m9 ; t7
+ paddsw m13, m9 ; t3
+ ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a
+ ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15
+ mova m9, [rsp+gprsize+32*0] ; in15
+ mova [rsp+gprsize+32*0], m10 ; t10a
+ mova m4, [rsp+gprsize+32*1] ; in0
+ mova [rsp+gprsize+32*1], m6 ; t6a
+ mova m6, [rsp+gprsize+32*2] ; in4
+ mova [rsp+gprsize+32*2], m2 ; t2
+ ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0
+ ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4
+ ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8
+ ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12
+ psubsw m10, m4, m8 ; t8a
+ paddsw m8, m4 ; t0a
+ psubsw m4, m9, m7 ; t9a
+ paddsw m9, m7 ; t1a
+ psubsw m7, m6, m12 ; t12a
+ paddsw m6, m12 ; t4a
+ psubsw m12, m11, m3 ; t13a
+ paddsw m11, m3 ; t5a
+ ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8
+ ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13
+ psubsw m3, m9, m11 ; t5
+ paddsw m9, m11 ; t1
+ psubsw m11, m4, m12 ; t12a
+ paddsw m4, m12 ; t8a
+ paddsw m12, m8, m6 ; t0
+ psubsw m8, m6 ; t4
+ paddsw m6, m10, m7 ; t9a
+ psubsw m10, m7 ; t13a
+ ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12
+ mova m7, [rsp+gprsize+32*0] ; t10a
+ mova m2, [rsp+gprsize+32*1] ; t6a
+ paddsw m15, m9, m13 ; -out15
+ psubsw m9, m13 ; t3a
+ paddsw m13, m11, m1 ; -out13
+ psubsw m11, m1 ; t15a
+ psubsw m1, m4, m7 ; t10
+ paddsw m7, m4 ; -out1
+ psubsw m4, m3, m2 ; t6
+ paddsw m3, m2 ; -out3
+ paddsw m2, m10, m14 ; out2
+ psubsw m10, m14 ; t14a
+ paddsw m14, m6, m0 ; out14
+ psubsw m6, m0 ; t11
+ mova m0, [rsp+gprsize+32*2] ; t2
+ mova [rsp+gprsize+32*1], m7
+ psubsw m7, m12, m0 ; t2a
+ paddsw m0, m12 ; out0
+ paddsw m12, m8, m5 ; out12
+ psubsw m8, m5 ; t7
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova [cq+32*0], m0
+ mova [cq+32*1], m2
+ mova [cq+32*2], m12
+ mova [cq+32*3], m14
+ vpbroadcastd m14, [pw_m2896_2896]
+ vpbroadcastd m12, [pw_2896_2896]
+ vpbroadcastd m2, [pd_2048]
+ punpcklwd m5, m11, m10
+ punpckhwd m11, m10
+ pmaddwd m10, m14, m5
+ pmaddwd m0, m14, m11
+ pmaddwd m5, m12
+ pmaddwd m11, m12
+ REPX {paddd x, m2}, m10, m0, m5, m11
+ REPX {psrad x, 12}, m10, m0, m5, m11
+ packssdw m10, m0 ; out10
+ packssdw m5, m11 ; -out5
+ punpcklwd m11, m8, m4
+ punpckhwd m8, m4
+ pmaddwd m4, m12, m11
+ pmaddwd m0, m12, m8
+ pmaddwd m11, m14
+ pmaddwd m8, m14
+ REPX {paddd x, m2}, m4, m0, m11, m8
+ REPX {psrad x, 12}, m4, m0, m11, m8
+ packssdw m4, m0 ; out4
+ packssdw m11, m8 ; -out11
+ punpcklwd m8, m9, m7
+ punpckhwd m9, m7
+ pmaddwd m7, m12, m8
+ pmaddwd m0, m12, m9
+ pmaddwd m8, m14
+ pmaddwd m9, m14
+ REPX {paddd x, m2}, m7, m0, m8, m9
+ REPX {psrad x, 12}, m7, m0, m8, m9
+ packssdw m7, m0 ; -out7
+ packssdw m8, m9 ; out8
+ punpckhwd m0, m6, m1
+ punpcklwd m6, m1
+ pmaddwd m1, m14, m0
+ pmaddwd m9, m14, m6
+ pmaddwd m0, m12
+ pmaddwd m6, m12
+ REPX {paddd x, m2}, m1, m9, m0, m6
+ REPX {psrad x, 12}, m1, m9, m0, m6
+ packssdw m9, m1 ; -out9
+ packssdw m6, m0 ; out6
+ vpbroadcastd m1, [o(pw_8192)]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
+ ; 16-bit here will produce the same result as using 32-bit intermediates.
+ paddsw m5, m10, m11 ; -out5
+ psubsw m10, m11 ; out10
+ psubsw m11, m4, m8 ; -out11
+ paddsw m4, m8 ; out4
+ psubsw m8, m7, m9 ; out8
+ paddsw m7, m9 ; -out7
+ psubsw m9, m1, m6 ; -out9
+ paddsw m6, m1 ; out6
+ vpbroadcastd m1, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
+ vpbroadcastd m1, [o(pw_2048)]
+ ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ ITX_16X16_LOAD_COEFS
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass1_end
+ pmulhrsw m6, m1
+ pmulhrsw m2, m1, m8
+ mova [rsp+32*2], m6
+ pmulhrsw m6, m1, m4
+ pmulhrsw m4, m1, m10
+ pmulhrsw m8, m1, [cq+32*3]
+ pmulhrsw m10, m1, [cq+32*2]
+ pmulhrsw m12, m1, [cq+32*1]
+ pmulhrsw m14, m1, [cq+32*0]
+ pxor m0, m0
+ psubw m0, m1
+ REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15
+ pmulhrsw m1, m0, m9
+ pmulhrsw m9, m0, m13
+ pmulhrsw m0, [rsp+32*1]
+ mova [rsp+16*0], xm15
+ mova [rsp+16*1], xm7
+ vperm2i128 m15, m15, m7, 0x31
+ vinserti128 m7, m2, xm14, 1
+ vperm2i128 m14, m2, m14, 0x31
+ vinserti128 m2, m9, xm5, 1
+ vperm2i128 m9, m9, m5, 0x31
+ vinserti128 m5, m4, xm12, 1
+ vperm2i128 m12, m4, m12, 0x31
+ vinserti128 m4, m11, xm3, 1
+ vperm2i128 m11, m11, m3, 0x31
+ vinserti128 m3, m10, xm6, 1
+ vperm2i128 m10, m10, m6, 0x31
+ vinserti128 m6, m1, xm0, 1
+ vperm2i128 m13, m1, m0, 0x31
+ vinserti128 m1, m8, [rsp+32*2], 1
+ vperm2i128 m8, m8, [rsp+32*2], 0x31
+ jmp m(idct_16x16_internal_8bpc).pass1_end3
+.pass2:
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass2_end
+ pmulhrsw m0, m1
+ pmulhrsw m8, m1
+ mova [rsp+32*0], m0
+ mova [rsp+32*2], m8
+ pxor m0, m0
+ psubw m0, m1
+ pmulhrsw m8, m0, m7
+ pmulhrsw m7, m0, m9
+ pmulhrsw m9, m1, m6
+ pmulhrsw m6, m1, m10
+ pmulhrsw m10, m0, m5
+ pmulhrsw m5, m0, m11
+ pmulhrsw m11, m1, m4
+ pmulhrsw m4, m1, m12
+ pmulhrsw m12, m0, m3
+ pmulhrsw m3, m0, m13
+ pmulhrsw m13, m1, m2
+ pmulhrsw m1, m14
+ pmulhrsw m14, m0, [rsp+32*1]
+ pmulhrsw m0, m15
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1
+ mova m15, [rsp+32*0]
+ WRITE_16X2 3, 4, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3
+ jmp m(idct_16x16_internal_8bpc).end3
+
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
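+ ; identity16 scale with the pass 1 rounding folded in:
+ ; out = (in + in*1697/4096 + 1) >> 1 ~= in*2*sqrt(2)/4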
+ pmulhrsw m%2, m%3, m%1
+ psraw m%2, 1
+ pavgw m%1, m%2 ; signs are guaranteed to be equal
+%endmacro
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ vpbroadcastd m7, [o(pw_1697x16)]
+ mova xm0, [cq+16* 0]
+ vinserti128 m0, [cq+16*16], 1
+ mova xm15, [cq+16* 1]
+ vinserti128 m15, [cq+16*17], 1
+ mova xm1, [cq+16* 2]
+ vinserti128 m1, [cq+16*18], 1
+ mova xm8, [cq+16* 3]
+ vinserti128 m8, [cq+16*19], 1
+ mova xm2, [cq+16* 4]
+ vinserti128 m2, [cq+16*20], 1
+ mova xm9, [cq+16* 5]
+ vinserti128 m9, [cq+16*21], 1
+ mova xm3, [cq+16* 6]
+ vinserti128 m3, [cq+16*22], 1
+ mova xm10, [cq+16* 7]
+ add cq, 16*16
+ vinserti128 m10, [cq+16* 7], 1
+ mova xm4, [cq-16* 8]
+ vinserti128 m4, [cq+16* 8], 1
+ mova xm11, [cq-16* 7]
+ vinserti128 m11, [cq+16* 9], 1
+ mova xm5, [cq-16* 6]
+ vinserti128 m5, [cq+16*10], 1
+ mova xm12, [cq-16* 5]
+ vinserti128 m12, [cq+16*11], 1
+ mova xm13, [cq-16* 3]
+ vinserti128 m13, [cq+16*13], 1
+ mova xm14, [cq-16* 1]
+ vinserti128 m14, [cq+16*15], 1
+ REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
+ 10, 4, 11, 5, 12, 13, 14
+ mova xm6, [cq-16* 4]
+ vinserti128 m6, [cq+16*12], 1
+ mova [rsp], m0
+ IDTX16B 6, 0, 7
+ mova xm0, [cq-16* 2]
+ vinserti128 m0, [cq+16*14], 1
+ pmulhrsw m7, m0
+ psraw m7, 1
+ pavgw m7, m0
+ jmp m(idct_16x16_internal_8bpc).pass1_end3
+ALIGN function_align
+.pass2:
+ vpbroadcastd m15, [o(pw_1697x16)]
+ mova [rsp+32*1], m0
+ REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14
+ mova m0, [rsp+32*1]
+ mova [rsp+32*1], m1
+ IDTX16 0, 1, 15
+ mova m1, [rsp+32*0]
+ pmulhrsw m15, m1
+ paddsw m1, m1
+ paddsw m15, m1
+ jmp m(idct_16x16_internal_8bpc).end
+
+%define o_base deint_shuf + 128
+
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+ vpbroadcastd m15, [o(pw_2896x8)]
+ pmulhrsw m0, m15, [%1+%2*0]
+ pmulhrsw m1, m15, [%1+%2*1]
+ pmulhrsw m2, m15, [%1+%2*2]
+ pmulhrsw m3, m15, [%1+%2*3]
+ pmulhrsw m4, m15, [%1+%2*4]
+ pmulhrsw m5, m15, [%1+%2*5]
+ pmulhrsw m6, m15, [%1+%2*6]
+ pmulhrsw m7, m15, [%1+%2*7]
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+ mova m7, [%1+%2*7]
+%endif
+%endmacro
+
+%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
+%if %3
+%if %3 == 1
+ vpbroadcastd m15, [o(pw_2896x8)]
+%endif
+ pmulhrsw m8, m15, [%1+%2*0]
+ pmulhrsw m9, m15, [%1+%2*1]
+ pmulhrsw m10, m15, [%1+%2*2]
+ pmulhrsw m11, m15, [%1+%2*3]
+ pmulhrsw m12, m15, [%1+%2*4]
+ pmulhrsw m13, m15, [%1+%2*5]
+ pmulhrsw m14, m15, [%1+%2*6]
+ pmulhrsw m15, [%1+%2*7]
+%else
+ mova m8, [%1+%2*0]
+ mova m9, [%1+%2*1]
+ mova m10, [%1+%2*2]
+ mova m11, [%1+%2*3]
+ mova m12, [%1+%2*4]
+ mova m13, [%1+%2*5]
+ mova m14, [%1+%2*6]
+ mova m15, [%1+%2*7]
+%endif
+%endmacro
+
+%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
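+ ; Duplicate each word of src and multiply the low/high halves by two
+ ; interleaved coefficient pairs; used by the fast path where the other
+ ; input of each butterfly is zero.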
+ vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
+ punpcklwd m%1, m%2, m%2
+ pmulhrsw m%1, m%3
+ vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
+ punpckhwd m%2, m%2
+ pmulhrsw m%2, m%3
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
+ %undef cmp
+ cmp eobd, 106
+ jle .fast
+ LOAD_8ROWS cq+32*1, 32*2
+ call m(idct_16x8_internal_8bpc).main
+ vperm2i128 m11, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ vperm2i128 m4, m1, m5, 0x31
+ vinserti128 m1, xm5, 1
+ vperm2i128 m5, m2, m6, 0x31
+ vinserti128 m2, xm6, 1
+ vperm2i128 m6, m3, m7, 0x31
+ vinserti128 m3, xm7, 1
+ pxor m7, m7
+ REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
+ punpckhwd m7, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpcklwd m3, m11, m4
+ punpckhwd m11, m4
+ punpckhwd m4, m5, m6
+ punpcklwd m5, m6
+ punpckhdq m6, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m5
+ punpckhdq m3, m5
+ punpckhdq m5, m11, m4
+ punpckldq m11, m4
+ punpckldq m4, m7, m1
+ punpckhdq m7, m1
+ punpckhqdq m12, m6, m0
+ punpcklqdq m0, m6 ; out4
+ punpckhqdq m13, m7, m4
+ punpcklqdq m4, m7 ; out5
+ punpckhqdq m14, m3, m2
+ punpcklqdq m2, m3 ; out6
+ punpckhqdq m15, m5, m11
+ punpcklqdq m11, m5 ; out7
+ mova [rsp+32*0], m0
+ mova [rsp+32*1], m4
+ mova [rsp+32*2], m2
+.fast:
+ LOAD_8ROWS cq+32*0, 32*2
+ call m(idct_16x8_internal_8bpc).main
+ vperm2i128 m8, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ vperm2i128 m4, m1, m5, 0x31
+ vinserti128 m1, xm5, 1
+ vperm2i128 m5, m2, m6, 0x31
+ vinserti128 m2, xm6, 1
+ vperm2i128 m6, m3, m7, 0x31
+ vinserti128 m3, xm7, 1
+ vpbroadcastd m9, [o(pw_8192)]
+ pxor m7, m7
+ REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
+ punpckhwd m7, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m8, m4
+ punpcklwd m8, m4
+ punpckhwd m4, m5, m6
+ punpcklwd m5, m6
+ punpckhdq m6, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m8, m5
+ punpckhdq m8, m5
+ punpckhdq m5, m3, m4
+ punpckldq m3, m4
+ punpckhdq m4, m7, m1
+ punpckldq m7, m1
+ punpcklqdq m1, m7, m4
+ punpckhqdq m7, m4 ; out9
+ punpckhqdq m4, m2, m8 ; out10
+ punpcklqdq m2, m8
+ punpckhqdq m8, m3, m5
+ punpcklqdq m3, m5
+ punpckhqdq m5, m0, m6 ; out8
+ punpcklqdq m0, m6
+ REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
+ cmp eobd, 106
+ jg .full
+ mova [rsp+32*0], m5
+ mova [rsp+32*1], m7
+ mova [rsp+32*2], m4
+ pmulhrsw m11, m9, m8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call .main_fast
+ jmp .pass2
+.dconly:
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+.full:
+ REPX {pmulhrsw x, m9}, m12, m13, m14, m15
+ pmulhrsw m6, m9, [rsp+32*2]
+ mova [rsp+32*2], m4
+ pmulhrsw m4, m9, [rsp+32*0]
+ mova [rsp+32*0], m5
+ pmulhrsw m5, m9, [rsp+32*1]
+ mova [rsp+32*1], m7
+ pmulhrsw m7, m9, m11
+ pmulhrsw m11, m9, m8
+ call .main
+.pass2:
+ vpbroadcastd m12, [o(pw_2048)]
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m13, m14, m15
+ pmulhrsw m12, [rsp]
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
+ REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
+ mova [rsp+32*0], m4
+ mova [rsp+32*1], m6
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 [rsp+32*0], 5, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 [rsp+32*1], 7, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 8, 9, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 10, 11, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 12, 13, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 14, 15, 4, 6
+ RET
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ call m(idct_8x16_internal_8bpc).main
+ mova m8, [rsp+gprsize+0*32]
+ mova [rsp+gprsize+0*32], m0
+ mova m9, [rsp+gprsize+1*32]
+ mova [rsp+gprsize+1*32], m1
+ mova m0, [rsp+gprsize+2*32]
+ mova [rsp+gprsize+2*32], m6
+ lea r5, [r6-(o_base)+pw_201_4091x8]
+ ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main
+ call m(idct_8x16_internal_8bpc).main
+ mova m8, [rsp+gprsize+0*32]
+ mova [rsp+gprsize+0*32], m0
+ mova m9, [rsp+gprsize+1*32]
+ mova [rsp+gprsize+1*32], m1
+ mova m0, [rsp+gprsize+2*32]
+ mova [rsp+gprsize+2*32], m6
+ punpcklwd m1, m15, m8 ; in31 in1
+ punpckhwd m8, m15 ; in3 in29
+ punpcklwd m15, m14, m9 ; in27 in5
+ punpckhwd m9, m14 ; in7 in25
+ punpcklwd m14, m13, m0 ; in23 in9
+ punpckhwd m0, m13 ; in11 in21
+ punpcklwd m13, m12, m11 ; in19 in13
+ punpckhwd m11, m12 ; in15 in17
+ ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a
+ ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a
+ ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a
+ ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a
+ ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a
+ ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a
+ ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a
+ ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a
+.main2:
+ psubsw m6, m1, m11 ; t17 t30
+ paddsw m1, m11 ; t16 t31
+ psubsw m11, m9, m14 ; t18 t29
+ paddsw m9, m14 ; t19 t28
+ psubsw m14, m15, m0 ; t21 t26
+ paddsw m15, m0 ; t20 t27
+ psubsw m0, m8, m13 ; t22 t25
+ paddsw m8, m13 ; t23 t24
+ ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a
+ ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a
+ ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a
+ ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
+ psubsw m13, m1, m9 ; t19a t28a
+ paddsw m1, m9 ; t16a t31a
+ psubsw m9, m8, m15 ; t20a t27a
+ paddsw m8, m15 ; t23a t24a
+ psubsw m15, m6, m11 ; t18 t29
+ paddsw m6, m11 ; t17 t30
+ psubsw m11, m0, m14 ; t21 t26
+ paddsw m0, m14 ; t22 t25
+ ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a
+ ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28
+ ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27
+ ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
+ vbroadcasti128 m12, [o(deint_shuf)]
+ psubsw m14, m1, m8 ; t23 t24
+ paddsw m1, m8 ; t16 t31
+ psubsw m8, m6, m0 ; t22a t25a
+ paddsw m6, m0 ; t17a t30a
+ psubsw m0, m15, m11 ; t21 t26
+ paddsw m15, m11 ; t18 t29
+ psubsw m11, m13, m9 ; t20a t27a
+ paddsw m13, m9 ; t19a t28a
+ REPX {pshufb x, m12}, m1, m6, m15, m13
+ ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a
+ vpbroadcastd m9, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20
+ shufps m9, m14, m8, q1032 ; t23a t22
+ vpblendd m14, m8, 0xcc ; t24a t25
+ shufps m8, m11, m0, q1032 ; t20 t21a
+ vpblendd m11, m0, 0xcc ; t27 t26a
+ punpcklqdq m0, m1, m6 ; t16 t17a
+ punpckhqdq m1, m6 ; t31 t30a
+ psubsw m10, m5, m8 ; out20 out21
+ paddsw m5, m8 ; out11 out10
+ psubsw m6, m3, m14 ; out24 out25
+ paddsw m3, m14 ; out7 out6
+ psubsw m8, m7, m0 ; out16 out17
+ paddsw m7, m0 ; out15 out14
+ mova m0, [rsp+gprsize+0*32]
+ punpcklqdq m12, m13, m15 ; t19a t18
+ punpckhqdq m13, m15 ; t28a t29
+ psubsw m15, m0, m1 ; out31 out30
+ paddsw m0, m1 ; out0 out1
+ mova m1, [rsp+gprsize+1*32]
+ mova [rsp+gprsize+0*32], m6
+ mova m6, [rsp+gprsize+2*32]
+ psubsw m14, m1, m13 ; out28 out29
+ paddsw m1, m13 ; out3 out2
+ psubsw m13, m2, m11 ; out27 out26
+ paddsw m2, m11 ; out4 out5
+ psubsw m11, m4, m9 ; out23 out22
+ paddsw m4, m9 ; out8 out9
+ psubsw m9, m6, m12 ; out19 out18
+ paddsw m6, m12 ; out12 out13
+ ret
+
+%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
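+ ; Load the 16-byte coefficient rows %3 and %4 and pack them into a single
+ ; register (row %3 in the even qwords, row %4 in the odd ones).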
+ vbroadcasti128 m%1, [cq+16*%3]
+ vbroadcasti128 m%2, [cq+16*%4]
+ shufpd m%1, m%2, 0x0c
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 8
+.dconly:
+ pmulhrsw xm0, xm2
+ movd xm2, [pw_2048] ; intentionally rip-relative
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ pxor m3, m3
+.dconly_loop:
+ mova m1, [dstq]
+ punpckhbw m2, m1, m3
+ punpcklbw m1, m3
+ paddw m2, m0
+ paddw m1, m0
+ packuswb m1, m2
+ mova [dstq], m1
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
+ %undef cmp
+ LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2
+ LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3
+ LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6
+ LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+ add cq, 16*16
+ LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10
+ LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11
+ LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14
+ LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15
+ REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
+ mova [rsp+32*0], m4
+ mova [rsp+32*1], m5
+ mova [rsp+32*2], m6
+ cmp eobd, 106
+ jg .full
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .pass2
+.full:
+ LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18
+ LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17
+ LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22
+ LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+ add cq, 16*8
+ LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26
+ LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25
+ LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30
+ LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+.pass2:
+ vpbroadcastd m12, [o(pw_8192)]
+ REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
+ mova [rsp+32*1], m9
+ mova [rsp+32*2], m10
+ punpckhwd m9, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m1, m3
+ punpcklwd m1, m3
+ punpcklwd m10, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ punpckhwd m3, m0, m9
+ punpcklwd m0, m9
+ punpckhwd m9, m2, m1
+ punpcklwd m2, m1
+ punpcklwd m7, m10, m4
+ punpckhwd m10, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m9
+ punpckhdq m3, m9
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m10, m5
+ punpckhdq m10, m5
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
+ pmulhrsw m12, [rsp+32*0]
+ mova [rsp+32*0], m8
+ vperm2i128 m4, m0, m6, 0x31
+ vinserti128 m0, xm6, 1
+ vperm2i128 m5, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m6, m2, m9, 0x31
+ vinserti128 m2, xm9, 1
+ vperm2i128 m7, m3, m10, 0x31
+ vinserti128 m3, xm10, 1
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ lea r2, [strideq*3]
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r2
+ lea r3, [dstq+strideq*4]
+ %define dstq r3
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r2
+ mova m0, [rsp+32*0]
+ mova m1, [rsp+32*1]
+ mova m2, [rsp+32*2]
+ punpckhwd m7, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m1, m11
+ punpcklwd m1, m11
+ punpckhwd m4, m12, m14
+ punpcklwd m12, m14
+ punpckhwd m5, m13, m15
+ punpcklwd m13, m15
+ punpckhwd m3, m0, m7
+ punpcklwd m0, m7
+ punpckhwd m9, m2, m1
+ punpcklwd m2, m1
+ punpcklwd m7, m12, m4
+ punpckhwd m12, m4
+ punpcklwd m4, m5, m13
+ punpckhwd m5, m13
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m9
+ punpckhdq m3, m9
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m12, m5
+ punpckhdq m12, m5
+ vperm2i128 m4, m0, m6, 0x31
+ vinserti128 m0, xm6, 1
+ vperm2i128 m5, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m6, m2, m9, 0x31
+ vinserti128 m2, xm9, 1
+ vperm2i128 m7, m3, m12, 0x31
+ vinserti128 m3, xm12, 1
+ call m(idct_16x8_internal_8bpc).main2
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ add r0, 16
+ add r3, 16
+ %define dstq r0
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r2
+ %define dstq r3
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r2
+ RET
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob
+ vpbroadcastd m9, [pw_5]
+ lea r4, [strideq*3]
+ sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
+.loop:
+ mova xm0, [cq+16* 0]
+ mova xm1, [cq+16* 4]
+ vinserti128 m0, [cq+16* 1], 1
+ vinserti128 m1, [cq+16* 5], 1
+ pxor m8, m8
+ mova [cq+32*0], m8
+ mova [cq+32*2], m8
+ add cq, 16*16
+ mova xm2, [cq-16* 8]
+ mova xm3, [cq-16* 4]
+ vinserti128 m2, [cq-16* 7], 1
+ vinserti128 m3, [cq-16* 3], 1
+ mova xm4, [cq+16* 0]
+ mova xm5, [cq+16* 4]
+ vinserti128 m4, [cq+16* 1], 1
+ vinserti128 m5, [cq+16* 5], 1
+ mova xm6, [cq+16* 8]
+ mova xm7, [cq+16*12]
+ vinserti128 m6, [cq+16* 9], 1
+ vinserti128 m7, [cq+16*13], 1
+ REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
+ REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .transpose8x8
+ REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ sub cq, 16*16-32
+ lea dstq, [dstq+r4*4]
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+ALIGN function_align
+.transpose8x8:
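+ ; In-register transpose of two 8x8 blocks of words held in m0-m7 (one block
+ ; per 128-bit lane), done in the usual word -> dword -> qword unpack stages.
+ ; m8 is used as scratch.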
+ punpckhwd m8, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob
+ add cq, 16*8
+ vpbroadcastd m9, [pw_4096]
+ lea r4, [strideq*3]
+ lea r5, [dstq+strideq*4]
+ sub eobd, 107
+.loop:
+ mova xm0, [cq-16*8]
+ mova xm1, [cq-16*7]
+ vinserti128 m0, [cq+16*0], 1
+ vinserti128 m1, [cq+16*1], 1
+ mova xm2, [cq-16*6]
+ mova xm3, [cq-16*5]
+ vinserti128 m2, [cq+16*2], 1
+ vinserti128 m3, [cq+16*3], 1
+ mova xm4, [cq-16*4]
+ mova xm5, [cq-16*3]
+ vinserti128 m4, [cq+16*4], 1
+ vinserti128 m5, [cq+16*5], 1
+ mova xm6, [cq-16*2]
+ mova xm7, [cq-16*1]
+ vinserti128 m6, [cq+16*6], 1
+ vinserti128 m7, [cq+16*7], 1
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r4
+ %define dstq r5
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r4
+ add cq, 16*16
+ add r0, 16
+ add r5, 16
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+%define o_base pw_5 + 128
+
+%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
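+; Loads 16 rows into m0-m15. With is_rect2 set, every row is pre-scaled by
+; 2896/4096 (~1/sqrt(2), the rectangular-transform scale) via pmulhrsw with
+; pw_2896x8. Row 15 is always spilled to [rsp] since all 16 vector registers
+; are in use, and zero_coefs clears the source rows after loading.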
+%if %3
+ vpbroadcastd m15, [o(pw_2896x8)]
+ pmulhrsw m0, m15, [%1+%2* 0]
+ pmulhrsw m1, m15, [%1+%2* 1]
+ pmulhrsw m2, m15, [%1+%2* 2]
+ pmulhrsw m3, m15, [%1+%2* 3]
+ pmulhrsw m4, m15, [%1+%2* 4]
+ pmulhrsw m5, m15, [%1+%2* 5]
+ pmulhrsw m6, m15, [%1+%2* 6]
+ pmulhrsw m7, m15, [%1+%2* 7]
+ pmulhrsw m8, m15, [%1+%2* 8]
+ pmulhrsw m9, m15, [%1+%2* 9]
+ pmulhrsw m10, m15, [%1+%2*10]
+ pmulhrsw m11, m15, [%1+%2*11]
+ pmulhrsw m12, m15, [%1+%2*12]
+ pmulhrsw m13, m15, [%1+%2*13]
+ pmulhrsw m14, m15, [%1+%2*14]
+ pmulhrsw m15, [%1+%2*15]
+%else
+ mova m0, [%1+%2* 0]
+ mova m1, [%1+%2* 1]
+ mova m2, [%1+%2* 2]
+ mova m3, [%1+%2* 3]
+ mova m4, [%1+%2* 4]
+ mova m5, [%1+%2* 5]
+ mova m6, [%1+%2* 6]
+ mova m7, [%1+%2* 7]
+ mova m8, [%1+%2* 8]
+ mova m9, [%1+%2* 9]
+ mova m10, [%1+%2*10]
+ mova m11, [%1+%2*11]
+ mova m12, [%1+%2*12]
+ mova m13, [%1+%2*13]
+ mova m14, [%1+%2*14]
+ mova m15, [%1+%2*15]
+%endif
+ mova [rsp], m15
+%if %4
+ pxor m15, m15
+ REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15
+%endif
+%endmacro
+
+%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
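+; Final sumsub of a coefficient register against its spilled counterpart,
+; rounded by the rnd constant (pw_2048, i.e. a rounding shift right by 4),
+; then added to the 16-pixel row at dstq+offset1 and its mirror row at
+; r2+offset2 and written back as packed bytes.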
+ mova m%4, [%2]
+ paddsw m%3, m%1, m%4
+ psubsw m%1, m%4
+ pmovzxbw m%4, [dstq+%6]
+ pmulhrsw m%3, m%5
+ pmulhrsw m%1, m%5
+ paddw m%3, m%4
+ pmovzxbw m%4, [r2+%7]
+ paddw m%1, m%4
+ packuswb m%3, m%1
+ vpermq m%3, m%3, q3120
+ mova [dstq+%6], xm%3
+ vextracti128 [r2+%7], m%3, 1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
+ base, tmp3
+ %undef cmp
+ LOAD_16ROWS cq, 64, 1
+ call m(idct_16x16_internal_8bpc).main
+ lea tmp1q, [rsp+32*7]
+ lea tmp2q, [tmp1q+32*8]
+ lea tmp3q, [tmp1q+32*16]
+ mova m1, [rsp+32*1]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_16384)]
+ call .transpose_2x8x8_round
+ mova m15, [rsp+32*0]
+ mova [tmp3q-32*4+ 0], xm0
+ vextracti128 [tmp3q+32*0+ 0], m0, 1
+ mova [tmp3q-32*3+ 0], xm2
+ vextracti128 [tmp3q+32*1+ 0], m2, 1
+ mova [tmp3q-32*2+ 0], xm4
+ vextracti128 [tmp3q+32*2+ 0], m4, 1
+ mova [tmp3q-32*1+ 0], xm6
+ vextracti128 [tmp3q+32*3+ 0], m6, 1
+ mova [tmp3q-32*4+16], xm8
+ vextracti128 [tmp3q+32*0+16], m8, 1
+ mova [tmp3q-32*3+16], xm10
+ vextracti128 [tmp3q+32*1+16], m10, 1
+ mova [tmp3q-32*2+16], xm12
+ vextracti128 [tmp3q+32*2+16], m12, 1
+ mova [tmp3q-32*1+16], xm14
+ vextracti128 [tmp3q+32*3+16], m14, 1
+ cmp eobd, 150
+ jg .full
+ vinserti128 m0, m1, xm9, 1
+ vperm2i128 m4, m1, m9, 0x31
+ vinserti128 m2, m5, xm13, 1
+ vperm2i128 m6, m5, m13, 0x31
+ vinserti128 m1, m3, xm11, 1
+ vperm2i128 m5, m3, m11, 0x31
+ vinserti128 m3, m7, xm15, 1
+ vperm2i128 m7, m7, m15, 0x31
+ call .main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp .idct16
+.dconly:
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.full:
+ mova [tmp1q-32*4], m1
+ mova [tmp1q-32*3], m3
+ mova [tmp1q-32*2], m5
+ mova [tmp1q-32*1], m7
+ mova [tmp1q+32*0], m9
+ mova [tmp1q+32*1], m11
+ mova [tmp1q+32*2], m13
+ mova [tmp1q+32*3], m15
+ LOAD_16ROWS cq+32, 64, 1
+ call m(idct_16x16_internal_8bpc).main
+ lea r2, [tmp3q+32*8]
+ mova m1, [rsp+32*1]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_16384)]
+ call .transpose_2x8x8_round
+ mova m15, [rsp+32*0]
+ mova [r2-32*4+ 0], xm0
+ vextracti128 [r2+32*0+ 0], m0, 1
+ mova [r2-32*3+ 0], xm2
+ vextracti128 [r2+32*1+ 0], m2, 1
+ mova [r2-32*2+ 0], xm4
+ vextracti128 [r2+32*2+ 0], m4, 1
+ mova [r2-32*1+ 0], xm6
+ vextracti128 [r2+32*3+ 0], m6, 1
+ mova [r2-32*4+16], xm8
+ vextracti128 [r2+32*0+16], m8, 1
+ mova [r2-32*3+16], xm10
+ vextracti128 [r2+32*1+16], m10, 1
+ mova [r2-32*2+16], xm12
+ vextracti128 [r2+32*2+16], m12, 1
+ mova [r2-32*1+16], xm14
+ vextracti128 [r2+32*3+16], m14, 1
+ vinserti128 m8, m1, xm9, 1
+ vperm2i128 m12, m1, m9, 0x31
+ mova xm0, [tmp1q-32*4]
+ mova xm1, [tmp1q-32*3]
+ vinserti128 m0, [tmp1q+32*0], 1
+ vinserti128 m1, [tmp1q+32*1], 1
+ vinserti128 m10, m5, xm13, 1
+ vperm2i128 m14, m5, m13, 0x31
+ mova xm4, [tmp1q-32*4+16]
+ mova xm5, [tmp1q-32*3+16]
+ vinserti128 m4, [tmp1q+32*0+16], 1
+ vinserti128 m5, [tmp1q+32*1+16], 1
+ vinserti128 m9, m3, xm11, 1
+ vperm2i128 m13, m3, m11, 0x31
+ mova xm2, [tmp1q-32*2]
+ mova xm3, [tmp1q-32*1]
+ vinserti128 m2, [tmp1q+32*2], 1
+ vinserti128 m3, [tmp1q+32*3], 1
+ vinserti128 m11, m7, xm15, 1
+ vperm2i128 m15, m7, m15, 0x31
+ mova xm6, [tmp1q-32*2+16]
+ mova xm7, [tmp1q-32*1+16]
+ vinserti128 m6, [tmp1q+32*2+16], 1
+ vinserti128 m7, [tmp1q+32*3+16], 1
+ call .main_oddhalf
+ LOAD_8ROWS_H r2-32*4, 32
+.idct16:
+ LOAD_8ROWS tmp3q-32*4, 32
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ call .pass2_end
+ RET
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; lower half is zero
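+ ; The second half of the odd-row inputs is zero, so each first-stage
+ ; ITX_MULSUB_2W butterfly collapses to two pmulhrsw scalings by the
+ ; pre-doubled (x8) constants before jumping into the shared .main2.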
+ mova [rsp+gprsize+32*1], m7
+ pxor m7, m7
+ mova [rsp+gprsize+32*0], m7
+ mova [rsp+gprsize+32*2], m7
+ vpbroadcastd m11, [o(pw_3703x8)]
+ vpbroadcastd m7, [o(pw_1751x8)]
+ vpbroadcastd m12, [o(pw_m1380x8)]
+ vpbroadcastd m8, [o(pw_3857x8)]
+ vpbroadcastd m13, [o(pw_3973x8)]
+ vpbroadcastd m15, [o(pw_995x8)]
+ pmulhrsw m11, m4 ; t29a
+ pmulhrsw m4, m7 ; t18a
+ pmulhrsw m12, m3 ; t19a
+ pmulhrsw m3, m8 ; t28a
+ pmulhrsw m13, m2 ; t27a
+ pmulhrsw m2, m15 ; t20a
+ vpbroadcastd m10, [o(pw_m2106x8)]
+ vpbroadcastd m7, [o(pw_3513x8)]
+ vpbroadcastd m9, [o(pw_3290x8)]
+ vpbroadcastd m8, [o(pw_2440x8)]
+ vpbroadcastd m14, [o(pw_m601x8)]
+ vpbroadcastd m15, [o(pw_4052x8)]
+ pmulhrsw m10, m5 ; t21a
+ pmulhrsw m5, m7 ; t26a
+ pmulhrsw m9, m6 ; t25a
+ pmulhrsw m6, m8 ; t22a
+ pmulhrsw m14, m1 ; t23a
+ pmulhrsw m1, m15 ; t24a
+ vpbroadcastd m15, [o(pd_2048)]
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ mova [rsp+gprsize+32*0], m15
+ mova [rsp+gprsize+32*1], m7
+ mova [rsp+gprsize+32*2], m8
+ vpbroadcastd m15, [o(pd_2048)]
+ ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a
+.main2:
+ psubsw m7, m12, m4 ; t18
+ paddsw m12, m4 ; t19
+ psubsw m4, m2, m10 ; t21
+ paddsw m2, m10 ; t20
+ psubsw m10, m14, m6 ; t22
+ paddsw m14, m6 ; t23
+ psubsw m6, m1, m9 ; t25
+ paddsw m1, m9 ; t24
+ psubsw m9, m13, m5 ; t26
+ paddsw m13, m5 ; t27
+ psubsw m5, m3, m11 ; t29
+ paddsw m3, m11 ; t28
+ ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a
+ ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a
+ psubsw m8, m14, m2 ; t20a
+ paddsw m14, m2 ; t23a
+ psubsw m2, m1, m13 ; t27a
+ paddsw m1, m13 ; t24a
+ psubsw m13, m6, m9 ; t21
+ paddsw m6, m9 ; t22
+ psubsw m9, m10, m4 ; t26
+ paddsw m10, m4 ; t25
+ ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27
+ ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a
+ mova m4, [rsp+gprsize+32*0] ; in31
+ mova [rsp+gprsize+32*0], m6 ; t22
+ mova m6, [rsp+gprsize+32*1] ; in15
+ mova [rsp+gprsize+32*1], m14 ; t23a
+ mova m14, [rsp+gprsize+32*2] ; in17
+ mova [rsp+gprsize+32*2], m1 ; t24a
+ ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a
+ psubsw m1, m0, m14 ; t17
+ paddsw m0, m14 ; t16
+ psubsw m14, m4, m6 ; t30
+ paddsw m4, m6 ; t31
+ ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a
+ psubsw m6, m0, m12 ; t19a
+ paddsw m0, m12 ; t16a
+ psubsw m12, m4, m3 ; t28a
+ paddsw m4, m3 ; t31a
+ psubsw m3, m14, m5 ; t18
+ paddsw m14, m5 ; t17
+ psubsw m5, m1, m7 ; t29
+ paddsw m1, m7 ; t30
+ ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a
+ ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28
+ psubsw m7, m1, m10 ; t25a
+ paddsw m1, m10 ; t30a
+ psubsw m10, m5, m9 ; t21
+ paddsw m5, m9 ; t18
+ psubsw m9, m12, m2 ; t20a
+ paddsw m12, m2 ; t19a
+ psubsw m2, m3, m13 ; t26
+ paddsw m3, m13 ; t29
+ psubsw m13, m6, m8 ; t27a
+ paddsw m6, m8 ; t28a
+ mova [tmp1q-32*2], m5
+ mova [tmp1q-32*1], m12
+ mova [tmp2q+32*0], m6
+ mova [tmp2q+32*1], m3
+ mova [tmp2q+32*2], m1
+ mova m5, [rsp+gprsize+32*0] ; t22
+ mova m6, [rsp+gprsize+32*1] ; t23
+ mova m3, [rsp+gprsize+32*2] ; t24a
+ psubsw m1, m14, m5 ; t22a
+ paddsw m14, m5 ; t17a
+ psubsw m5, m0, m6 ; t23
+ paddsw m0, m6 ; t16
+ psubsw m6, m4, m3 ; t24
+ paddsw m4, m3 ; t31
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m3, [o(pw_2896_2896)]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m14
+ mova [tmp2q+32*3], m4
+ ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27
+ ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a
+ ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25
+ ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a
+ mova [tmp1q+32*0], m13
+ mova [tmp1q+32*1], m2
+ mova [tmp1q+32*2], m7
+ mova [tmp1q+32*3], m6
+ mova [tmp2q-32*4], m5
+ mova [tmp2q-32*3], m1
+ mova [tmp2q-32*2], m10
+ mova [tmp2q-32*1], m9
+ ret
+ALIGN function_align
+.transpose_2x8x8_round:
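+ ; Transposes two 8x8 word blocks (m0-m7 and m8-m15), multiplying everything
+ ; by the rounding constant the caller broadcast into m7 (pw_16384 or pw_8192).
+ ; The caller is expected to have stashed rows 6/7 of the first block in
+ ; [rsp+32*0] and [rsp+32*1]; transposed row 15 is spilled to [rsp+32*0].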
+ punpckhwd m6, m12, m13
+ punpcklwd m12, m13
+ punpckhwd m13, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m9, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m10, m11
+ punpcklwd m10, m11
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5
+ punpckhdq m11, m8, m10
+ punpckldq m8, m10
+ punpckldq m10, m12, m14
+ punpckhdq m12, m14
+ punpckhdq m14, m13, m15
+ punpckldq m13, m15
+ punpckldq m15, m6, m9
+ punpckhdq m6, m9
+ punpckhqdq m9, m8, m10
+ punpcklqdq m8, m10
+ punpcklqdq m10, m11, m12
+ punpckhqdq m11, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m14, m6
+ punpcklqdq m14, m6
+ pmulhrsw m6, m7, [rsp+gprsize+32*0]
+ REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
+ pmulhrsw m7, [rsp+gprsize+32*1]
+ mova [rsp+gprsize+32*0], m15
+ punpckhwd m15, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m15, m1
+ punpckhdq m15, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m15
+ punpcklqdq m6, m15
+ ret
+ALIGN function_align
+.pass2_end:
+ mova [rsp+gprsize+32*0], m7
+ mova [rsp+gprsize+32*2], m15
+ vpbroadcastd m15, [o(pw_2048)]
+ IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*1]
+ IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m7, [rsp+gprsize+32*0]
+ mova m1, [rsp+gprsize+32*2]
+ IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0
+ ret
+
+; Perform the final sumsub step and YMM lane shuffling
+%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
+ mova m%3, [tmp2q+32*( 3-%1)]
+ psubsw m%4, m%1, m%3
+ paddsw m%1, m%3
+ mova m%3, [tmp1q+32*(11-%2)]
+ mova [tmp1q+32*(11-%2)+16], xm%4
+ vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
+ paddsw m%4, m%2, m%3
+ psubsw m%2, m%3
+ mova [tmp1q+32*(11-%2)], xm%2
+ vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
+ vperm2i128 m%2, m%1, m%4, 0x31
+ vinserti128 m%1, xm%4, 1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+ PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
+ vpbroadcastd m15, [o(pw_2896x8)]
+ pmulhrsw m0, m15, [cq+32* 1]
+ pmulhrsw m1, m15, [cq+32* 3]
+ pmulhrsw m2, m15, [cq+32* 5]
+ pmulhrsw m3, m15, [cq+32* 7]
+ pmulhrsw m4, m15, [cq+32* 9]
+ pmulhrsw m5, m15, [cq+32*11]
+ pmulhrsw m6, m15, [cq+32*13]
+ pmulhrsw m7, m15, [cq+32*15]
+ pmulhrsw m8, m15, [cq+32*17]
+ pmulhrsw m9, m15, [cq+32*19]
+ pmulhrsw m10, m15, [cq+32*21]
+ pmulhrsw m11, m15, [cq+32*23]
+ pmulhrsw m12, m15, [cq+32*25]
+ pmulhrsw m13, m15, [cq+32*27]
+ pmulhrsw m14, m15, [cq+32*29]
+ pmulhrsw m15, [cq+32*31]
+ lea tmp1q, [rsp+32*7]
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ LOAD_16ROWS cq+32*0, 32*2, 1, 0
+ pxor m15, m15
+ mov r3d, 8
+.zero_loop:
+ mova [cq+32*0], m15
+ mova [cq+32*1], m15
+ mova [cq+32*2], m15
+ mova [cq+32*3], m15
+ add cq, 32*4
+ dec r3d
+ jg .zero_loop
+ call m(idct_16x16_internal_8bpc).main
+ call .pass1_end
+ lea r2, [strideq*3]
+ mov r3, dstq
+.pass2:
+ vpbroadcastd m7, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ call m(idct_16x16_internal_8bpc).main
+ mova [rsp+32*2], m15
+ vpbroadcastd m15, [o(pw_2048)]
+ REPX {pmulhrsw x, m15}, m2, m3, m0
+ WRITE_16X2 2, 3, 1, 2, strideq*2, r2
+ pmulhrsw m1, m15, [rsp+32*1]
+ WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1
+ lea dstq, [dstq+strideq*4]
+ REPX {pmulhrsw x, m15}, m4, m5, m6, m7
+ WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 2, 3, strideq*2, r2
+ lea dstq, [dstq+strideq*4]
+ REPX {pmulhrsw x, m15}, m8, m9, m10, m11
+ WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 2, 3, strideq*2, r2
+ lea dstq, [dstq+strideq*4]
+ REPX {pmulhrsw x, m15}, m11, m12, m13, m14
+ pmulhrsw m15, [rsp+32*2]
+ WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 2, 3, strideq*2, r2
+ test r3, r3
+ jnz .right_half
+ RET
+.right_half:
+ LOAD_8ROWS tmp1q-32*4, 32
+ LOAD_8ROWS_H tmp2q-32*4, 32
+ lea dstq, [r3+16]
+ xor r3d, r3d
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ jmp .pass2
+ALIGN function_align
+.pass1_end:
+ mova [rsp+gprsize+32*0], m9
+ IDCT32_PASS1_END 0, 8, 1, 9
+ IDCT32_PASS1_END 2, 10, 1, 9
+ IDCT32_PASS1_END 3, 11, 1, 9
+ IDCT32_PASS1_END 4, 12, 1, 9
+ IDCT32_PASS1_END 5, 13, 1, 9
+ IDCT32_PASS1_END 6, 14, 1, 9
+ IDCT32_PASS1_END 7, 15, 1, 9
+ mova m1, [rsp+gprsize+32*1]
+ mova m9, [rsp+gprsize+32*0]
+ mova [rsp+gprsize+32*0], m6
+ mova [rsp+gprsize+32*1], m7
+ IDCT32_PASS1_END 1, 9, 6, 7
+ ret
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob
+%undef cmp
+ lea r6, [o_base]
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m10, [o(pw_1697x16)]
+ vpbroadcastd m12, [o(pw_8192)]
+ cmp eobd, 43 ; if (eob > 43)
+ setg r4b ; iteration_count++
+ cmp eobd, 150 ; if (eob > 150)
+ setg al ; iteration_count++
+ add eobd, -279 ; if (eob > 278)
+ adc r4b, al ; iteration_count++
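+ ; r4b = (eob > 43) + (eob > 150) + (eob > 278); the dec r4b / jge .loop
+ ; below therefore runs r4b+1 times, so only the coefficient regions that
+ ; can be nonzero for the given eob are processed.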
+ lea r3, [strideq*3]
+ mov r6, cq
+ paddw m11, m12, m12 ; pw_16384
+.loop:
+ mova xm0, [cq+64* 0]
+ mova xm1, [cq+64* 1]
+ vinserti128 m0, [cq+64* 8], 1
+ vinserti128 m1, [cq+64* 9], 1
+ mova xm2, [cq+64* 2]
+ mova xm3, [cq+64* 3]
+ vinserti128 m2, [cq+64*10], 1
+ vinserti128 m3, [cq+64*11], 1
+ mova xm4, [cq+64* 4]
+ mova xm5, [cq+64* 5]
+ vinserti128 m4, [cq+64*12], 1
+ vinserti128 m5, [cq+64*13], 1
+ mova xm6, [cq+64* 6]
+ mova xm7, [cq+64* 7]
+ vinserti128 m6, [cq+64*14], 1
+ vinserti128 m7, [cq+64*15], 1
+ REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ dec r4b
+ jge .loop
+ sub cq, 32
+ pxor m0, m0
+ mov r0d, 8
+ cmp cq, r6
+ ja .zero_loop
+.zero_loop_half:
+ mova [r6+64*0], m0
+ mova [r6+64*1], m0
+ add r6, 64*4
+ mova [r6-64*2], m0
+ mova [r6-64*1], m0
+ sub r0d, 2
+ jg .zero_loop_half
+ RET
+.zero_loop:
+ mova [r6+32*0], m0
+ mova [r6+32*1], m0
+ mova [r6+32*2], m0
+ mova [r6+32*3], m0
+ add r6, 32*4
+ dec r0d
+ jg .zero_loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob
+%undef cmp
+ lea r6, [o_base]
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m10, [o(pw_1697x16)]
+ vpbroadcastd m11, [o(pw_2048)]
+ cmp eobd, 35 ; if (eob > 35)
+ setg r4b ; iteration_count++
+ cmp eobd, 150 ; if (eob > 150)
+ setg r3b ; iteration_count += 2
+ lea r4d, [r4+r3*2]
+ lea r3, [strideq*3]
+ mov r5, dstq
+ mov r6, cq
+.loop:
+ mova xm0, [cq+32* 0]
+ mova xm1, [cq+32* 1]
+ vinserti128 m0, [cq+32* 8], 1
+ vinserti128 m1, [cq+32* 9], 1
+ mova xm2, [cq+32* 2]
+ mova xm3, [cq+32* 3]
+ vinserti128 m2, [cq+32*10], 1
+ vinserti128 m3, [cq+32*11], 1
+ mova xm4, [cq+32* 4]
+ mova xm5, [cq+32* 5]
+ vinserti128 m4, [cq+32*12], 1
+ vinserti128 m5, [cq+32*13], 1
+ mova xm6, [cq+32* 6]
+ mova xm7, [cq+32* 7]
+ vinserti128 m6, [cq+32*14], 1
+ vinserti128 m7, [cq+32*15], 1
+ REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ dec r4b
+ jl .ret
+ test r4b, 1
+ jz .loop
+ add cq, 32*15
+ lea dstq, [r5+16]
+ jmp .loop
+.ret:
+ sub cd, eax
+ pxor m0, m0
+ add cd, 384
+.zero_loop:
+ mova [r6+32*0], m0
+ mova [r6+32*1], m0
+ mova [r6+32*2], m0
+ mova [r6+32*3], m0
+ add r6, 32*4
+ sub cd, 128
+ jge .zero_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+ PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
+ base, tmp3, tmp4
+ %undef cmp
+ lea tmp1q, [rsp+32*7]
+ lea tmp2q, [tmp1q+32*8]
+ sub eobd, 136
+ mov tmp4d, eobd
+.pass1_loop:
+ LOAD_8ROWS cq+64*1, 64*2
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+ test tmp4d, tmp4d
+ jl .fast
+ LOAD_8ROWS_H cq+64*17, 64*2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ LOAD_8ROWS_H cq+64*16, 64*2
+ pxor m0, m0
+ REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ mova [rsp], m15
+ jmp .idct16
+.fast:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+.idct16:
+ LOAD_8ROWS cq+64*0, 64*2
+ pxor m15, m15
+ REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
+ vpbroadcastd m7, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ lea tmp3q, [tmp1q+32*32]
+ mova m15, [rsp]
+ mova [tmp3q-32*4], m0
+ mova [tmp3q-32*3], m2
+ mova [tmp3q-32*2], m4
+ mova [tmp3q-32*1], m6
+ mova [tmp3q+32*0], m8
+ mova [tmp3q+32*1], m10
+ mova [tmp3q+32*2], m12
+ mova [tmp3q+32*3], m14
+ add tmp3q, 32*8
+ mova [tmp3q-32*4], m1
+ mova [tmp3q-32*3], m3
+ mova [tmp3q-32*2], m5
+ mova [tmp3q-32*1], m7
+ mova [tmp3q+32*0], m9
+ mova [tmp3q+32*1], m11
+ mova [tmp3q+32*2], m13
+ mova [tmp3q+32*3], m15
+ vpbroadcastd m9, [o(pw_8192)]
+ pmulhrsw m0, m9, [tmp1q-32*4]
+ pmulhrsw m1, m9, [tmp1q-32*3]
+ pmulhrsw m2, m9, [tmp1q-32*2]
+ pmulhrsw m3, m9, [tmp1q-32*1]
+ pmulhrsw m4, m9, [tmp1q+32*0]
+ pmulhrsw m5, m9, [tmp1q+32*1]
+ pmulhrsw m6, m9, [tmp1q+32*2]
+ pmulhrsw m7, m9, [tmp1q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q-32*4], m0
+ pmulhrsw m0, m9, [tmp2q-32*4]
+ mova [tmp2q-32*4], m1
+ pmulhrsw m1, m9, [tmp2q-32*3]
+ mova [tmp1q-32*3], m2
+ pmulhrsw m2, m9, [tmp2q-32*2]
+ mova [tmp2q-32*3], m3
+ pmulhrsw m3, m9, [tmp2q-32*1]
+ mova [tmp1q-32*2], m4
+ pmulhrsw m4, m9, [tmp2q+32*0]
+ mova [tmp2q-32*2], m5
+ pmulhrsw m5, m9, [tmp2q+32*1]
+ mova [tmp1q-32*1], m6
+ pmulhrsw m6, m9, [tmp2q+32*2]
+ mova [tmp2q-32*1], m7
+ pmulhrsw m7, m9, [tmp2q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add cq, 32
+ add tmp1q, 32*16
+ add tmp2q, 32*16
+ add eobd, 0x80000000
+ jnc .pass1_loop
+ add tmp1q, 32*24
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ test tmp4d, tmp4d
+ jge .pass2_loop
+ add tmp1q, 32*16
+ add tmp2q, 32*16
+ add tmp3q, 32*16
+.pass2_loop:
+ LOAD_8ROWS tmp2q-32*4, 32
+ test tmp4d, tmp4d
+ jl .fast2
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ sub tmp3q, 32*8
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ sub tmp3q, 32*16
+ jmp .pass2_loop_end
+.fast2:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ sub tmp3q, 32*24
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+.pass2_loop_end:
+ LOAD_8ROWS tmp3q-32*4, 32
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
+ lea tmp3q, [tmp1q-32*32]
+ cmp tmp2q, tmp3q
+ jb .ret
+ sub tmp2q, 32*32
+ sub dstq, r3
+ lea r2, [r2+r3+16]
+ add dstq, 16
+ jmp .pass2_loop
+.ret:
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob
+ %undef cmp
+ vpbroadcastd m9, [pw_8192]
+ sub eobd, 136 ; if (eob < 136)
+ shr eobd, 30 ; topleft 16x16 only
+ lea eobd, [eobq*2-8]
+ lea r4, [strideq*3]
+ mov r5, dstq
+ lea r6, [cq+32]
+.loop:
+ mova xm0, [cq+64* 0]
+ mova xm1, [cq+64* 1]
+ vinserti128 m0, [cq+64* 8], 1
+ vinserti128 m1, [cq+64* 9], 1
+ mova xm2, [cq+64* 2]
+ mova xm3, [cq+64* 3]
+ vinserti128 m2, [cq+64*10], 1
+ vinserti128 m3, [cq+64*11], 1
+ mova xm4, [cq+64* 4]
+ mova xm5, [cq+64* 5]
+ vinserti128 m4, [cq+64*12], 1
+ vinserti128 m5, [cq+64*13], 1
+ mova xm6, [cq+64* 6]
+ mova xm7, [cq+64* 7]
+ vinserti128 m6, [cq+64*14], 1
+ vinserti128 m7, [cq+64*15], 1
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r4
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r4
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ inc eobd
+ jz .ret
+ test eobd, 3
+ jnz .loop
+ add cq, 64*15
+ lea dstq, [r5+16]
+ jmp .loop
+.ret:
+ pxor m0, m0
+ mov r0d, 16
+ cmp cq, r6
+ jne .zero_loop
+.zero_loop_topleft:
+ mova [r6-32*1], m0
+ mova [r6+32*1], m0
+ mova [r6+32*3], m0
+ mova [r6+32*5], m0
+ add r6, 64*4
+ sub r0d, 4
+ jg .zero_loop_topleft
+ RET
+.zero_loop:
+ mova [r6-32*1], m0
+ mova [r6+32*0], m0
+ mova [r6+32*1], m0
+ mova [r6+32*2], m0
+ add r6, 32*4
+ dec r0d
+ jg .zero_loop
+ RET
+
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
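+; Performs the idct16/idct32 sumsub on values fetched from the temp buffers,
+; then combines the result with the idct64 terms in src[1-2] to produce the
+; four symmetric outputs 0+n, 31-n, 32+n and 63-n. With 6 args (pass 1) the
+; results go back to the temp buffers; with 10 args (pass 2) they are rounded
+; by m14 (pw_2048) and added to the destination rows selected by the offsets.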
+%if %1 & 1
+ mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
+ mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
+%else
+ mova m%5, [tmp1q-32*(45-%1)]
+ mova m%4, [tmp2q-32*(20+%1)]
+%endif
+ psubsw m%6, m%5, m%4 ; idct32 out31-n
+ paddsw m%5, m%4 ; idct32 out 0+n
+ psubsw m%4, m%6, m%3 ; out32+n
+ paddsw m%6, m%3 ; out31-n
+ psubsw m%3, m%5, m%2 ; out63-n
+ paddsw m%5, m%2 ; out 0+n
+%if %0 == 6 ; pass 1
+%if %1 & 1
+ mova [tmp2q-32*(19-%1)], m%4
+ mova [tmp1q-32*(14+%1)], m%6
+ mova [tmp1q+32*(18-%1)], m%3
+ mova [tmp2q-32*(51-%1)], m%5
+%else
+ mova [tmp1q-32*(13-%1)], m%4
+ mova [tmp2q-32*(20+%1)], m%6
+ mova [tmp2q+32*(12-%1)], m%3
+ mova [tmp1q-32*(45-%1)], m%5
+%endif
+%else ; pass 2
+ REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
+%if %1 & 1
+ %define %%d0 r2
+ %define %%d1 dstq
+%else
+ %define %%d0 dstq
+ %define %%d1 r2
+%endif
+ pmovzxbw m%2, [%%d0+%9 ]
+ paddw m%2, m%4
+ pmovzxbw m%4, [%%d1+%8 ]
+ paddw m%4, m%6
+ pmovzxbw m%6, [%%d1+%10]
+ paddw m%3, m%6
+ pmovzxbw m%6, [%%d0+%7 ]
+ paddw m%5, m%6
+ packuswb m%2, m%4
+ packuswb m%3, m%5
+ vpermq m%2, m%2, q3120
+ vpermq m%3, m%3, q3120
+ mova [%%d0+%9 ], xm%2
+ vextracti128 [%%d1+%8 ], m%2, 1
+ mova [%%d1+%10], xm%3
+ vextracti128 [%%d0+%7 ], m%3, 1
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.normal:
+ PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+ %undef cmp
+ lea tmp1q, [rsp+32*23]
+ lea tmp2q, [tmp1q+32*24]
+ sub eobd, 151
+ mov r7d, eobd
+.pass1_loop:
+ LOAD_16ROWS cq, 64
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ mova m15, [rsp+32*0]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m2
+ mova [tmp1q-32*2], m4
+ mova [tmp1q-32*1], m6
+ mova [tmp1q+32*0], m8
+ mova [tmp1q+32*1], m10
+ mova [tmp1q+32*2], m12
+ mova [tmp1q+32*3], m14
+ mova [tmp2q-32*4], m1
+ mova [tmp2q-32*3], m3
+ mova [tmp2q-32*2], m5
+ mova [tmp2q-32*1], m7
+ mova [tmp2q+32*0], m9
+ mova [tmp2q+32*1], m11
+ mova [tmp2q+32*2], m13
+ mova [tmp2q+32*3], m15
+ add cq, 32
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ add eobd, 0x80000000
+ jnc .pass1_loop
+ lea r2, [rsp+32*23]
+ mova xm0, [r2-32*4+ 0]
+ mova xm1, [r2-32*2+ 0]
+ vinserti128 m0, [r2+32*0+ 0], 1
+ vinserti128 m1, [r2+32*2+ 0], 1
+ mova xm2, [r2-32*4+16]
+ mova xm3, [r2-32*2+16]
+ vinserti128 m2, [r2+32*0+16], 1
+ vinserti128 m3, [r2+32*2+16], 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+ test r7d, r7d
+ jl .fast
+ lea r3, [r2+32*8]
+ mova xm4, [r3-32*4+ 0]
+ mova xm5, [r3-32*2+ 0]
+ vinserti128 m4, [r3+32*0+ 0], 1
+ vinserti128 m5, [r3+32*2+ 0], 1
+ mova xm6, [r3-32*4+16]
+ mova xm7, [r3-32*2+16]
+ vinserti128 m6, [r3+32*0+16], 1
+ vinserti128 m7, [r3+32*2+16], 1
+.fast:
+ mova [rsp], m8
+ lea tmp1q, [rsp+32*7]
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ mova xm0, [r2-32*3+ 0]
+ mova xm1, [r2-32*1+ 0]
+ vinserti128 m0, [r2+32*1+ 0], 1
+ vinserti128 m1, [r2+32*3+ 0], 1
+ mova xm2, [r2-32*3+16]
+ mova xm3, [r2-32*1+16]
+ vinserti128 m2, [r2+32*1+16], 1
+ vinserti128 m3, [r2+32*3+16], 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ test r7d, r7d
+ jl .fast2
+ mova xm4, [r3-32*3+ 0]
+ mova xm5, [r3-32*1+ 0]
+ vinserti128 m4, [r3+32*1+ 0], 1
+ vinserti128 m5, [r3+32*3+ 0], 1
+ mova xm6, [r3-32*3+16]
+ mova xm7, [r3-32*1+16]
+ vinserti128 m6, [r3+32*1+16], 1
+ vinserti128 m7, [r3+32*3+16], 1
+.fast2:
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ add r2, 32*24
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova xm0, [r2-32*4+ 0]
+ mova xm3, [r2-32*1+16]
+ vinserti128 m0, [r2+32*0+ 0], 1
+ vinserti128 m3, [r2+32*3+16], 1
+ mova xm4, [r2-32*4+16]
+ mova xm7, [r2-32*1+ 0]
+ vinserti128 m4, [r2+32*0+16], 1
+ vinserti128 m7, [r2+32*3+ 0], 1
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r7d, r7d
+ jl .fast3
+ add r3, 32*24
+ mova xm1, [r3-32*1+16]
+ mova xm2, [r3-32*4+ 0]
+ vinserti128 m1, [r3+32*3+16], 1
+ vinserti128 m2, [r3+32*0+ 0], 1
+ mova xm5, [r3-32*1+ 0]
+ mova xm6, [r3-32*4+16]
+ vinserti128 m5, [r3+32*3+ 0], 1
+ vinserti128 m6, [r3+32*0+16], 1
+.fast3:
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova xm0, [r2-32*2+ 0]
+ mova xm3, [r2-32*3+16]
+ vinserti128 m0, [r2+32*2+ 0], 1
+ vinserti128 m3, [r2+32*1+16], 1
+ mova xm4, [r2-32*2+16]
+ mova xm7, [r2-32*3+ 0]
+ vinserti128 m4, [r2+32*2+16], 1
+ vinserti128 m7, [r2+32*1+ 0], 1
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r7d, r7d
+ jl .fast4
+ mova xm1, [r3-32*3+16]
+ mova xm2, [r3-32*2+ 0]
+ vinserti128 m1, [r3+32*1+16], 1
+ vinserti128 m2, [r3+32*2+ 0], 1
+ mova xm5, [r3-32*3+ 0]
+ mova xm6, [r3-32*2+16]
+ vinserti128 m5, [r3+32*1+ 0], 1
+ vinserti128 m6, [r3+32*2+16], 1
+.fast4:
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+ RET
+ALIGN function_align
+%define o_base idct64_mul - 8
+cglobal_label .main_part1
+ ; idct64 steps 1-5:
+ ; in1/31/17/15/ 9/23/25/ 7 ->
+ ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
+ ; in5/27/21/11/13/19/29/ 3 ->
+ ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
+ vpbroadcastd m11, [o(idct64_mul+4* 0)]
+ vpbroadcastd m13, [o(idct64_mul+4* 1)]
+ vpbroadcastd m10, [o(idct64_mul+4* 4)]
+ vpbroadcastd m12, [o(idct64_mul+4* 5)]
+ pmulhrsw m11, m0 ; t63a
+ pmulhrsw m0, m13 ; t32a
+ pmulhrsw m10, m1 ; t62a
+ pmulhrsw m1, m12 ; t33a
+ vpbroadcastd m9, [o(idct64_mul+4* 8)]
+ vpbroadcastd m13, [o(idct64_mul+4* 9)]
+ vpbroadcastd m8, [o(idct64_mul+4*12)]
+ vpbroadcastd m12, [o(idct64_mul+4*13)]
+ pmulhrsw m9, m2 ; t61a
+ pmulhrsw m2, m13 ; t34a
+ pmulhrsw m8, m3 ; t60a
+ pmulhrsw m3, m12 ; t35a
+ psubsw m12, m0, m1 ; t33
+ paddsw m0, m1 ; t32
+ psubsw m1, m3, m2 ; t34
+ paddsw m3, m2 ; t35
+ psubsw m2, m8, m9 ; t61
+ paddsw m8, m9 ; t60
+ psubsw m9, m11, m10 ; t62
+ paddsw m11, m10 ; t63
+ ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a
+ vpbroadcastd m14, [o(pw_401_4076)]
+ ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
+ psubsw m10, m0, m3 ; t35a
+ paddsw m0, m3 ; t32a
+ psubsw m3, m11, m8 ; t60a
+ paddsw m11, m8 ; t63a
+ psubsw m8, m9, m2 ; t34
+ paddsw m9, m2 ; t33
+ psubsw m2, m12, m1 ; t61
+ paddsw m12, m1 ; t62
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m9
+ mova [tmp2q+32*2], m12
+ mova [tmp2q+32*3], m11
+ vpbroadcastd m13, [o(pw_m4017_799)]
+ vpbroadcastd m14, [o(pw_799_4017)]
+ ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a
+ ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp2q+32*0], m10
+ mova [tmp2q+32*1], m8
+ vpbroadcastd m3, [o(idct64_mul+4*16)]
+ vpbroadcastd m11, [o(idct64_mul+4*17)]
+ vpbroadcastd m2, [o(idct64_mul+4*20)]
+ vpbroadcastd m10, [o(idct64_mul+4*21)]
+ vpbroadcastd m1, [o(idct64_mul+4*24)]
+ vpbroadcastd m9, [o(idct64_mul+4*25)]
+ vpbroadcastd m0, [o(idct64_mul+4*28)]
+ vpbroadcastd m8, [o(idct64_mul+4*29)]
+ pmulhrsw m3, m4 ; t59a
+ pmulhrsw m4, m11 ; t36a
+ pmulhrsw m2, m5 ; t58a
+ pmulhrsw m5, m10 ; t37a
+ pmulhrsw m1, m6 ; t57a
+ pmulhrsw m6, m9 ; t38a
+ pmulhrsw m0, m7 ; t56a
+ pmulhrsw m7, m8 ; t39a
+ psubsw m8, m4, m5 ; t37
+ paddsw m4, m5 ; t36
+ psubsw m5, m7, m6 ; t38
+ paddsw m7, m6 ; t39
+ psubsw m6, m0, m1 ; t57
+ paddsw m0, m1 ; t56
+ psubsw m1, m3, m2 ; t58
+ paddsw m3, m2 ; t59
+ ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a
+ vpbroadcastd m10, [o(pw_3166_2598)]
+ ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a
+ psubsw m2, m7, m4 ; t36a
+ paddsw m7, m4 ; t39a
+ psubsw m4, m0, m3 ; t59a
+ paddsw m0, m3 ; t56a
+ psubsw m3, m6, m1 ; t37
+ paddsw m6, m1 ; t38
+ psubsw m1, m5, m8 ; t58
+ paddsw m5, m8 ; t57
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ mova [tmp2q-32*4], m0
+ mova [tmp2q-32*3], m5
+ vpbroadcastd m6, [o(pw_m799_m4017)]
+ vpbroadcastd m7, [o(pw_m4017_799)]
+ ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59
+ ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m1
+ mova [tmp2q-32*2], m3
+ mova [tmp2q-32*1], m2
+ ret
+%define o_base pw_5 + 128
+.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
+ sub r6, o_idct64_offset + 8
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ vpbroadcastd m14, [o(pw_m2896_2896)]
+.main_part2_pass1_loop:
+ call .main_part2_internal
+ IDCT64_PART2_END 0, 7, 0, 6, 9, 10
+ IDCT64_PART2_END 7, 8, 5, 0, 6, 7
+ IDCT64_PART2_END 8, 2, 1, 0, 6, 7
+ IDCT64_PART2_END 15, 3, 4, 0, 6, 7
+ cmp tmp1q, tmp2q
+ jne .main_part2_pass1_loop
+ ret
+cglobal_label .main_part2_internal
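+ ; Loads one symmetric group of eight t-values (t32a/39a/40a/47a/48a/55a/56a/63a),
+ ; advances tmp1q and tmp2q toward each other, and performs idct64 steps 6-8,
+ ; leaving the combined terms in registers for the IDCT64_PART2_END
+ ; invocations in the pass 1/2 loops.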
+ mova m0, [tmp1q-32*12] ; t32a
+ mova m6, [tmp2q-32*13] ; t39a
+ mova m1, [tmp1q-32* 4] ; t40a
+ mova m5, [tmp2q+32* 3] ; t55a
+ add tmp1q, 32
+ sub tmp2q, 32
+ mova m2, [tmp1q+32* 3] ; t48a
+ mova m4, [tmp2q-32* 4] ; t47a
+ mova m3, [tmp1q+32*11] ; t56a
+ mova m7, [tmp2q+32*12] ; t63a
+ psubsw m8, m0, m6 ; t39
+ paddsw m0, m6 ; t32
+ psubsw m6, m4, m1 ; t40
+ paddsw m4, m1 ; t47
+ psubsw m1, m2, m5 ; t55
+ paddsw m2, m5 ; t48
+ psubsw m5, m7, m3 ; t56
+ paddsw m7, m3 ; t63
+ ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a
+ vpbroadcastd m9, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a
+ psubsw m3, m0, m4 ; t47a
+ paddsw m0, m4 ; t32a
+ psubsw m4, m7, m2 ; t48a
+ paddsw m7, m2 ; t63a
+ psubsw m2, m5, m1 ; t40
+ paddsw m5, m1 ; t39
+ psubsw m1, m8, m6 ; t55
+ paddsw m8, m6 ; t56
+ ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48
+ ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a
+ ret
+.main_part2_pass2:
+ sub r6, o_idct64_offset + 8
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ lea r9, [strideq*5] ; stride*5
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+ lea r8, [r3+strideq*2] ; stride*8
+ lea r2, [dstq+r7]
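+ ; dstq and r2 step toward each other one row per iteration; together with
+ ; the stride multiples passed below they address the symmetric row pairs
+ ; (out 0+n / 63-n etc.) of the 64-row output.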
+.main_part2_pass2_loop:
+ vpbroadcastd m14, [o(pw_m2896_2896)]
+ call .main_part2_internal
+ vpbroadcastd m14, [o(pw_2048)]
+ IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8
+ IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8
+ IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
+ IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
+ add dstq, strideq
+ sub r2, strideq
+ cmp tmp1q, tmp2q
+ jne .main_part2_pass2_loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 16
+.dconly:
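+ ; eob == 0: scale the single DC coefficient (already cleared by the eob
+ ; store above), broadcast it as a per-pixel delta and add it to r3d rows
+ ; of 64 pixels, 32 bytes at a time.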
+ pmulhrsw xm0, xm2
+ movd xm2, [o(pw_2048)]
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ pxor m1, m1
+.dconly_loop:
+ mova m2, [dstq+32*0]
+ mova m3, [dstq+32*1]
+ punpckhbw m4, m2, m1
+ punpcklbw m2, m1
+ punpckhbw m5, m3, m1
+ punpcklbw m3, m1
+ paddw m4, m0
+ paddw m2, m0
+ paddw m5, m0
+ paddw m3, m0
+ packuswb m2, m4
+ packuswb m3, m5
+ mova [dstq+32*0], m2
+ mova [dstq+32*1], m3
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+ LOAD_8ROWS cq+32*0, 32*4
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ lea tmp1q, [rsp+32*7]
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ LOAD_8ROWS cq+32*2, 32*4
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [cq+32* 1]
+ mova m1, [cq+32*31]
+ mova m2, [cq+32*17]
+ mova m3, [cq+32*15]
+ mova m4, [cq+32* 9]
+ mova m5, [cq+32*23]
+ mova m6, [cq+32*25]
+ mova m7, [cq+32* 7]
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [cq+32* 5]
+ mova m1, [cq+32*27]
+ mova m2, [cq+32*21]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*13]
+ mova m5, [cq+32*19]
+ mova m6, [cq+32*29]
+ mova m7, [cq+32* 3]
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+ sub tmp1q, 32*36
+ lea r2, [strideq*3]
+ mov tmp2d, 4
+.pass2_loop:
+ lea r3, [tmp1q-32*8]
+ mova xm0, [r3 -32*4]
+ mova xm1, [r3 -32*3]
+ vinserti128 m0, [tmp1q-32*4], 1
+ vinserti128 m1, [tmp1q-32*3], 1
+ mova xm2, [r3 -32*2]
+ mova xm3, [r3 -32*1]
+ vinserti128 m2, [tmp1q-32*2], 1
+ vinserti128 m3, [tmp1q-32*1], 1
+ mova xm4, [r3 +32*0]
+ mova xm5, [r3 +32*1]
+ vinserti128 m4, [tmp1q+32*0], 1
+ vinserti128 m5, [tmp1q+32*1], 1
+ mova xm6, [r3 +32*2]
+ mova xm7, [r3 +32*3]
+ vinserti128 m6, [tmp1q+32*2], 1
+ vinserti128 m7, [tmp1q+32*3], 1
+ mova xm8, [r3 -32*4+16]
+ mova xm9, [r3 -32*3+16]
+ vinserti128 m8, [tmp1q-32*4+16], 1
+ vinserti128 m9, [tmp1q-32*3+16], 1
+ mova xm10, [r3 -32*2+16]
+ mova xm11, [r3 -32*1+16]
+ vinserti128 m10, [tmp1q-32*2+16], 1
+ vinserti128 m11, [tmp1q-32*1+16], 1
+ mova xm12, [r3 +32*0+16]
+ mova xm13, [r3 +32*1+16]
+ vinserti128 m12, [tmp1q+32*0+16], 1
+ vinserti128 m13, [tmp1q+32*1+16], 1
+ mova xm14, [r3 +32*2+16]
+ mova xm15, [r3 +32*3+16]
+ vinserti128 m14, [tmp1q+32*2+16], 1
+ vinserti128 m15, [tmp1q+32*3+16], 1
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ call m(idct_16x16_internal_8bpc).main
+ mova [rsp+32*0], m15
+ vpbroadcastd m15, [o(pw_2048)]
+ REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 2, 3, 1, 2, strideq*2, r2
+ pmulhrsw m1, m15, [rsp+32*1]
+ WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1
+ lea r3, [dstq+strideq*4]
+ %define dstq r3
+ WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 2, 3, strideq*2, r2
+ REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
+ lea r3, [r3+strideq*4]
+ WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 2, 3, strideq*2, r2
+ pmulhrsw m15, [rsp+32*0]
+ lea r3, [r3+strideq*4]
+ WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 2, 3, strideq*2, r2
+ add tmp1q, 32*16
+ add r0, 16
+ dec tmp2d
+ jg .pass2_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+ PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
+ lea tmp1q, [rsp+32*7]
+ lea r10d, [eobq-136]
+ sar r10d, 31
+.pass1_loop:
+ lea tmp2q, [tmp1q+32*16]
+ LOAD_8ROWS cq+64*1, 64*2, 1
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+ test r10b, r10b
+ jnz .fast
+ LOAD_8ROWS_H cq+64*17, 64*2, 2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ LOAD_8ROWS_H cq+64*16, 64*2, 1
+ mova [rsp], m15
+ pxor m15, m15
+ REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ jmp .idct16
+.fast:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+.idct16:
+ LOAD_8ROWS cq+64*0, 64*2, 1
+ pxor m15, m15
+ REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
+ vpbroadcastd m7, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ lea r3, [tmp1q+32*48]
+ mova m15, [rsp]
+ mova [r3-32*4], m0
+ mova [r3-32*3], m2
+ mova [r3-32*2], m4
+ mova [r3-32*1], m6
+ mova [r3+32*0], m8
+ mova [r3+32*1], m10
+ mova [r3+32*2], m12
+ mova [r3+32*3], m14
+ add r3, 32*24
+ mova [r3-32*4], m1
+ mova [r3-32*3], m3
+ mova [r3-32*2], m5
+ mova [r3-32*1], m7
+ mova [r3+32*0], m9
+ mova [r3+32*1], m11
+ mova [r3+32*2], m13
+ mova [r3+32*3], m15
+ vpbroadcastd m9, [o(pw_16384)]
+ pmulhrsw m0, m9, [tmp1q-32*4]
+ pmulhrsw m1, m9, [tmp1q-32*3]
+ pmulhrsw m2, m9, [tmp1q-32*2]
+ pmulhrsw m3, m9, [tmp1q-32*1]
+ pmulhrsw m4, m9, [tmp1q+32*0]
+ pmulhrsw m5, m9, [tmp1q+32*1]
+ pmulhrsw m6, m9, [tmp1q+32*2]
+ pmulhrsw m7, m9, [tmp1q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q-32*4], m0
+ pmulhrsw m0, m9, [tmp2q-32*4]
+ mova [tmp2q-32*4], m1
+ pmulhrsw m1, m9, [tmp2q-32*3]
+ mova [tmp1q-32*3], m2
+ pmulhrsw m2, m9, [tmp2q-32*2]
+ mova [tmp2q-32*3], m3
+ pmulhrsw m3, m9, [tmp2q-32*1]
+ mova [tmp1q-32*2], m4
+ pmulhrsw m4, m9, [tmp2q+32*0]
+ mova [tmp2q-32*2], m5
+ pmulhrsw m5, m9, [tmp2q+32*1]
+ mova [tmp1q-32*1], m6
+ pmulhrsw m6, m9, [tmp2q+32*2]
+ mova [tmp2q-32*1], m7
+ pmulhrsw m7, m9, [tmp2q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add cq, 32
+ add tmp1q, 32*8
+ add r10d, 0x80000000
+ jnc .pass1_loop
+ lea r2, [rsp+32*55]
+ lea r7, [r2+32*24]
+.pass2_loop:
+ lea r3, [r2+32*8]
+ lea r8, [r7+32*8]
+ mova m0, [r2-32*4]
+ mova m1, [r2-32*2]
+ mova m2, [r2+32*0]
+ mova m3, [r2+32*2]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+ test r10b, r10b
+ jnz .fast2
+ mova m4, [r3-32*4]
+ mova m5, [r3-32*2]
+ mova m6, [r3+32*0]
+ mova m7, [r3+32*2]
+.fast2:
+ mova [rsp], m8
+ lea tmp1q, [rsp+32*39]
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ mova m0, [r2-32*3]
+ mova m1, [r2-32*1]
+ mova m2, [r2+32*1]
+ mova m3, [r2+32*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ test r10b, r10b
+ jnz .fast3
+ mova m4, [r3-32*3]
+ mova m5, [r3-32*1]
+ mova m6, [r3+32*1]
+ mova m7, [r3+32*3]
+.fast3:
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [r7-32*4]
+ mova m3, [r7+32*3]
+ mova m4, [r7+32*0]
+ mova m7, [r7-32*1]
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10b, r10b
+ jnz .fast4
+ mova m1, [r8+32*3]
+ mova m2, [r8-32*4]
+ mova m5, [r8-32*1]
+ mova m6, [r8+32*0]
+.fast4:
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [r7-32*2]
+ mova m3, [r7+32*1]
+ mova m4, [r7+32*2]
+ mova m7, [r7-32*3]
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10b, r10b
+ jnz .fast5
+ mova m1, [r8+32*1]
+ mova m2, [r8-32*2]
+ mova m5, [r8-32*3]
+ mova m6, [r8+32*2]
+.fast5:
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+ add r10d, 0x80000000
+ jc .ret
+ lea r2, [rsp+32*7]
+ lea r7, [r2+32*16]
+ sub dstq, r8
+ lea dstq, [dstq+strideq*4+16]
+ jmp .pass2_loop
+.ret:
+ RET
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+.normal:
+ PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
+ base, tmp3, tmp4
+ lea tmp1q, [rsp+32*7]
+ lea tmp4d, [eobq-136]
+.pass1_loop:
+ LOAD_8ROWS cq+64*0, 64*4, 1
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ LOAD_8ROWS cq+64*2, 64*4, 1
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ vpbroadcastd m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [cq+64* 1]
+ pmulhrsw m1, m7, [cq+64*31]
+ pmulhrsw m2, m7, [cq+64*17]
+ pmulhrsw m3, m7, [cq+64*15]
+ pmulhrsw m4, m7, [cq+64* 9]
+ pmulhrsw m5, m7, [cq+64*23]
+ pmulhrsw m6, m7, [cq+64*25]
+ pmulhrsw m7, [cq+64* 7]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))]
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ pmulhrsw m0, m7, [cq+64* 5]
+ pmulhrsw m1, m7, [cq+64*27]
+ pmulhrsw m2, m7, [cq+64*21]
+ pmulhrsw m3, m7, [cq+64*11]
+ pmulhrsw m4, m7, [cq+64*13]
+ pmulhrsw m5, m7, [cq+64*19]
+ pmulhrsw m6, m7, [cq+64*29]
+ pmulhrsw m7, [cq+64* 3]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+ sub tmp1q, 32*44
+ vpbroadcastd m10, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
+ add cq, 32
+ add tmp4d, 0x80000000
+ jnc .pass1_loop
+ lea tmp1q, [rsp+32*15]
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ mov tmp4b, 4
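+ ; Only the low byte is written: bit 30 of tmp4d still records whether
+ ; eob < 136 (the reduced-coefficient fast path tested below), while tmp4b
+ ; counts the four 16-column strips of pass 2.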
+.pass2_loop:
+ lea tmp2q, [tmp1q+32*64]
+ LOAD_8ROWS tmp1q-32*4, 32
+ test tmp4d, 0x40000000
+ jnz .fast
+ LOAD_8ROWS_H tmp2q-32*4, 32
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ lea tmp3q, [tmp2q-32*8]
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ mova [rsp], m15
+ jmp .idct16
+.fast:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+.idct16:
+ lea tmp3q, [tmp1q-32*8]
+ LOAD_8ROWS tmp3q-32*4, 32
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
+ add tmp1q, 32*16
+ sub dstq, r3
+ lea r2, [r2+r3+16]
+ add dstq, 16
+ dec tmp4b
+ jg .pass2_loop
+ RET
+ALIGN function_align
+.transpose_round_interleave:
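+ ; Transposes and rounds (pmulhrsw by m10, pw_16384 for 64x32 or pw_8192 for
+ ; 64x64) the 16x16 tiles stored at tmp1q, recombining the two 8-row halves
+ ; at tmp1q and tmp1q+32*8 as it goes; four tiles per call.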
+ mov tmp3d, 4
+.loop:
+ lea tmp2q, [tmp1q+32*8]
+ mova xm0, [tmp1q-32*4]
+ mova xm1, [tmp1q-32*3]
+ vinserti128 m0, [tmp2q-32*4], 1
+ vinserti128 m1, [tmp2q-32*3], 1
+ mova xm2, [tmp1q-32*2]
+ mova xm3, [tmp1q-32*1]
+ vinserti128 m2, [tmp2q-32*2], 1
+ vinserti128 m3, [tmp2q-32*1], 1
+ mova xm4, [tmp1q+32*0]
+ mova xm5, [tmp1q+32*1]
+ vinserti128 m4, [tmp2q+32*0], 1
+ vinserti128 m5, [tmp2q+32*1], 1
+ mova xm6, [tmp1q+32*2]
+ mova xm7, [tmp1q+32*3]
+ vinserti128 m6, [tmp2q+32*2], 1
+ vinserti128 m7, [tmp2q+32*3], 1
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova xm8, [tmp1q-32*4+16]
+ mova xm9, [tmp1q-32*3+16]
+ vinserti128 m8, [tmp2q-32*4+16], 1
+ vinserti128 m9, [tmp2q-32*3+16], 1
+ mova [tmp1q-32*4], m0
+ mova [tmp2q-32*4], m1
+ mova [tmp1q-32*3], m2
+ mova [tmp2q-32*3], m3
+ mova xm2, [tmp1q-32*2+16]
+ mova xm3, [tmp1q-32*1+16]
+ vinserti128 m2, [tmp2q-32*2+16], 1
+ vinserti128 m3, [tmp2q-32*1+16], 1
+ mova [tmp1q-32*2], m4
+ mova [tmp2q-32*2], m5
+ mova [tmp1q-32*1], m6
+ mova [tmp2q-32*1], m7
+ mova xm4, [tmp1q+32*0+16]
+ mova xm5, [tmp1q+32*1+16]
+ vinserti128 m4, [tmp2q+32*0+16], 1
+ vinserti128 m5, [tmp2q+32*1+16], 1
+ mova xm6, [tmp1q+32*2+16]
+ mova xm7, [tmp1q+32*3+16]
+ vinserti128 m6, [tmp2q+32*2+16], 1
+ vinserti128 m7, [tmp2q+32*3+16], 1
+ pmulhrsw m0, m8, m10
+ pmulhrsw m1, m9, m10
+ REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add tmp1q, 32*16
+ dec tmp3d
+ jg .loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+.normal:
+ PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
+ lea tmp1q, [rsp+32*71]
+ lea r10d, [eobq-136]
+.pass1_loop:
+ LOAD_8ROWS cq+64*0, 64*4
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ LOAD_8ROWS cq+64*2, 64*4
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ mova m4, [cq+64* 9]
+ mova m5, [cq+64*23]
+ mova m6, [cq+64*25]
+ mova m7, [cq+64* 7]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ mova m4, [cq+64*13]
+ mova m5, [cq+64*19]
+ mova m6, [cq+64*29]
+ mova m7, [cq+64* 3]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+ sub tmp1q, 32*44
+ vpbroadcastd m10, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
+ add cq, 32
+ add r10d, 0x80000000
+ jnc .pass1_loop
+ lea tmp1q, [rsp+32*7]
+ mov r10b, 4
+.pass2_loop:
+ lea r2, [tmp1q+32*64]
+ mova m0, [r2-32*4]
+ mova m1, [r2-32*2]
+ mova m2, [r2+32*0]
+ mova m3, [r2+32*2]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+ mova [rsp], m4
+ test r10d, 0x40000000
+ jnz .fast
+ lea r3, [r2+32*64]
+ mova m4, [r3-32*4]
+ mova m5, [r3-32*2]
+ mova m6, [r3+32*0]
+ mova m7, [r3+32*2]
+.fast:
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ mova m0, [r2-32*3]
+ mova m1, [r2-32*1]
+ mova m2, [r2+32*1]
+ mova m3, [r2+32*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ test r10d, 0x40000000
+ jnz .fast2
+ mova m4, [r3-32*3]
+ mova m5, [r3-32*1]
+ mova m6, [r3+32*1]
+ mova m7, [r3+32*3]
+.fast2:
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add r2, 32*8
+ add r3, 32*8
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [r2-32*4] ; 1
+ mova m3, [r2+32*3] ; 15
+ mova m4, [r2+32*0] ; 9
+ mova m7, [r2-32*1] ; 7
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10d, 0x40000000
+ jnz .fast3
+ mova m1, [r3+32*3] ; 31
+ mova m2, [r3-32*4] ; 17
+ mova m5, [r3-32*1] ; 23
+ mova m6, [r3+32*0] ; 25
+.fast3:
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [r2-32*2] ; 5
+ mova m3, [r2+32*1] ; 11
+ mova m4, [r2+32*2] ; 13
+ mova m7, [r2-32*3] ; 3
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10d, 0x40000000
+ jnz .fast4
+ mova m1, [r3+32*1] ; 27
+ mova m2, [r3-32*2] ; 21
+ mova m5, [r3-32*3] ; 19
+ mova m6, [r3+32*2] ; 29
+.fast4:
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+ sub tmp1q, 32*28
+ sub dstq, r8
+ lea dstq, [dstq+strideq*4+16]
+ dec r10b
+ jg .pass2_loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm
new file mode 100644
index 0000000000..31c60fdd45
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_avx512.asm
@@ -0,0 +1,7389 @@
+; Copyright © 2020-2023, VideoLAN and dav1d authors
+; Copyright © 2020-2023, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+const \
+dup16_perm, db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
+ db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
+ db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
+const \
+int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+ db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
+ db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+ db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+ db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+ db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
+ db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23
+ db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
+ db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
+ db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
+idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9
+ db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
+ db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
+ db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
+idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35
+ db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
+ db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51
+ db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37
+end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60
+ db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61
+ db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
+ db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62
+
+; packed 4-bit qword shuffle indices
+permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
+ dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
+ dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
+ dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
+permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
+ dq 0xc824352d56128751, 0xd906171e74301e15
+ dq 0x6271604b03472d62, 0x735342782165b426
+ dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
+permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486
+ dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
+ dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
+ dq 0x5115049dd9045b79, 0x733726bffb263d1f
+permD: dq 0x0cda098800041504, 0x0edb09b2028c3726
+ dq 0x0f11fa9c01150415, 0x0988f326039d2637
+ dq 0x05640f1108269d8c, 0x05290edb0aaebfae
+ dq 0x0005000509378c9d, 0xffffffff0bbfaebf
+
+pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11
+gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13
+gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10
+gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16
+
+int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
+int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0
+
+pb_32: times 4 db 32
+pw_2048: times 2 dw 2048
+pw_4096: times 2 dw 4096
+pw_8192: times 2 dw 8192
+pw_16384: times 2 dw 16384
+pw_1697x16: times 2 dw 1697*16
+pw_1697x8: times 2 dw 1697*8
+pw_2896x8: times 2 dw 2896*8
+pd_2048: dd 2048
+
+%define pw_5 (permD+52)
+%define pd_m1 (permD+60)
+%define pw_3803_1321 (permD+44)
+%define pw_2482_3803 (permD+12)
+%define pw_2440_3290 (permD+ 4)
+%define pw_m3290_2440 (permD+28)
+%define pw_3857_1380 (permD+36)
+%define pw_m1380_3857 (permD+20)
+
+pw_8192_m8192: dw 8192, -8192
+pw_m8192_8192: dw -8192, 8192
+pw_16384_m16384: dw 16384, -16384
+pw_m16384_16384: dw -16384, 16384
+
+pw_m1321_2482: dw -1321, 2482
+pw_m3344_3344: dw -3344, 3344
+pw_2482_3344: dw 2482, 3344
+pw_m3803_3344: dw -3803, 3344
+pd_3344: dd 3344
+pw_m1321_m3344: dw -1321, -3344
+pw_2896_m2896: dw 2896, -2896
+
+pw_1567_m3784: dw 1567, -3784
+pw_3784_m1567: dw 3784, -1567
+pw_4017_m799: dw 4017, -799
+pw_2276_m3406: dw 2276, -3406
+pw_m799_m4017: dw -799, -4017
+pw_m3406_m2276: dw -3406, -2276
+
+%macro COEF_PAIR 2-3 0
+pw_%1_%2: dw %1, %2
+pw_m%2_%1: dw -%2, %1
+%if %3
+pw_m%1_m%2: dw -%1, -%2
+%endif
+%endmacro
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784, 1
+COEF_PAIR 3784, 1567
+COEF_PAIR 201, 4091
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 4052, 601
+COEF_PAIR 3166, 2598, 1
+COEF_PAIR 3920, 1189, 1
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017, 799
+
+%macro COEF_X8 1-*
+%rep %0
+ dw %1*8, %1*8
+ %rotate 1
+%endrep
+%endmacro
+
+pw_m2276x8: COEF_X8 -2276
+pw_3406x8: COEF_X8 3406
+pw_4017x8: COEF_X8 4017
+pw_799x8: COEF_X8 799
+pw_3784x8: COEF_X8 3784
+pw_1567x8: COEF_X8 1567
+
+pw_4076x8: COEF_X8 4076
+pw_401x8: COEF_X8 401
+pw_m2598x8: COEF_X8 -2598
+pw_3166x8: COEF_X8 3166
+pw_3612x8: COEF_X8 3612
+pw_1931x8: COEF_X8 1931
+pw_m1189x8: COEF_X8 -1189
+pw_3920x8: COEF_X8 3920
+
+pw_4091x8: COEF_X8 4091
+pw_201x8: COEF_X8 201
+pw_m2751x8: COEF_X8 -2751
+pw_3035x8: COEF_X8 3035
+pw_3703x8: COEF_X8 3703
+pw_1751x8: COEF_X8 1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8: COEF_X8 3857
+pw_3973x8: COEF_X8 3973
+pw_995x8: COEF_X8 995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8: COEF_X8 3513
+pw_3290x8: COEF_X8 3290
+pw_2440x8: COEF_X8 2440
+pw_m601x8: COEF_X8 -601
+pw_4052x8: COEF_X8 4052
+
+pw_401_4076x8: dw 401*8, 4076*8
+pw_m2598_3166x8: dw -2598*8, 3166*8
+pw_1931_3612x8: dw 1931*8, 3612*8
+pw_m1189_3920x8: dw -1189*8, 3920*8
+pw_799_4017x8: dw 799*8, 4017*8
+pw_m2276_3406x8: dw -2276*8, 3406*8
+
+pw_201_4091x8: dw 201*8, 4091*8
+pw_m601_4052x8: dw -601*8, 4052*8
+pw_995_3973x8: dw 995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8: dw 1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8: dw 2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
+pw_101_4095x8: dw 101*8, 4095*8
+pw_m2824_2967x8: dw -2824*8, 2967*8
+pw_1660_3745x8: dw 1660*8, 3745*8
+pw_m1474_3822x8: dw -1474*8, 3822*8
+pw_897_3996x8: dw 897*8, 3996*8
+pw_m2191_3461x8: dw -2191*8, 3461*8
+pw_2359_3349x8: dw 2359*8, 3349*8
+pw_m700_4036x8: dw -700*8, 4036*8
+pw_501_4065x8: dw 501*8, 4065*8
+pw_m2520_3229x8: dw -2520*8, 3229*8
+pw_2019_3564x8: dw 2019*8, 3564*8
+pw_m1092_3948x8: dw -1092*8, 3948*8
+pw_1285_3889x8: dw 1285*8, 3889*8
+pw_m1842_3659x8: dw -1842*8, 3659*8
+pw_2675_3102x8: dw 2675*8, 3102*8
+pw_m301_4085x8: dw -301*8, 4085*8
+
+idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474
+COEF_PAIR 401, 4076, 1
+COEF_PAIR 799, 4017
+ COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996
+dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017
+ COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092
+COEF_PAIR 1931, 3612, 1
+COEF_PAIR 3406, 2276
+ COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889
+dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276
+
+SECTION .text
+
+%define o_base int8_permA+64*18
+%define o(x) (r5 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
+; 16 = special_mul1, 32 = special_mul2
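+; Multiplies packed word pairs by a coefficient pair via vpdpwssd and
+; round-shifts the 32-bit products by 12; the rnd register is expected to
+; hold pd_2048, and the flag bits select the coefficient ordering and how
+; the two dword halves are packed back into words.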
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+ mova m%2, m%4
+%if %7 & 16
+ vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd}
+ mova m%3, m%4
+%if %7 & 32
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%else
+ vpdpwssd m%3, m%1, m%6
+%endif
+%elif %7 & 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%elif %6 < 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, m%6
+%elif %7 & 1
+ vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd}
+%else
+ vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd}
+%endif
+%if %7 & 2
+ psrld m%2, 12
+ pslld m%3, 4
+ vpshrdd m%1, m%3, m%2, 16
+%elif %7 & 4
+ ; compared to using shifts (as above) this has better throughput,
+ ; but worse latency and requires setting up the opmask/index
+ ; registers, so only use this method for the larger transforms
+ pslld m%1, m%2, 4
+ vpmultishiftqb m%1{k7}, m13, m%3
+%else
+ psrad m%2, 12
+ psrad m%3, 12
+%if %7 & 8 == 0
+ packssdw m%1, m%3, m%2
+%endif
+%endif
+%endmacro
+
+; flags: same as ITX_MUL2X_PACK
+%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
+%if %11 & 1
+ vpbroadcastd m%4, [o(pw_%9_%10)]
+ vpbroadcastd m%4{k1}, [o(pw_%7_%8)]
+ vpbroadcastd m%5, [o(pw_m%10_%9)]
+ vpbroadcastd m%5{k1}, [o(pw_m%8_%7)]
+%else
+ vpbroadcastd m%4, [o(pw_m%10_%9)]
+ vpbroadcastd m%4{k1}, [o(pw_m%8_%7)]
+ vpbroadcastd m%5, [o(pw_%9_%10)]
+ vpbroadcastd m%5{k1}, [o(pw_%7_%8)]
+%endif
+ ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
+ punpcklwd m%3, m%2, m%1
+ punpckhwd m%2, m%1
+%if %7 < 32
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, m%7
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, m%7
+%else
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd}
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd}
+%endif
+ psrad m%1, 12
+ psrad m%4, 12
+ packssdw m%1, m%4
+ mova m%4, m%5
+%if %7 < 32
+ vpdpwssd m%4, m%2, m%6
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, m%6
+%else
+ vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd}
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd}
+%endif
+ psrad m%4, 12
+ psrad m%2, 12
+%if %0 == 8
+ packssdw m%8, m%2, m%4
+%else
+ packssdw m%2, m%4
+%endif
+%endmacro
+
+%macro WRAP_XMM 1+
+ %xdefine %%reset RESET_MM_PERMUTATION
+ INIT_XMM cpuname
+ DEFINE_MMREGS xmm
+ AVX512_MM_PERMUTATION
+ %1
+ %%reset
+%endmacro
+
+%macro WRAP_YMM 1+
+ INIT_YMM cpuname
+ %1
+ INIT_ZMM cpuname
+%endmacro
+
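+; Scale the two packed output rows by pw_<rnd> (if given), add them to the
+; 4x4 destination block in the row order given by the arguments, and store
+; the clamped bytes.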
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+ vpbroadcastd m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+ movd m2, [%%row_adr1]
+ pinsrd m2, [%%row_adr2], 1
+ movd m3, [%%row_adr3]
+ pinsrd m3, [%%row_adr4], 1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ movd [%%row_adr1], m0
+ pextrd [%%row_adr2], m0, 1
+ pextrd [%%row_adr3], m0, 2
+ pextrd [%%row_adr4], m0, 3
+ ret
+%endmacro
+
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
+ %define %%p1 m(i%1_%3_internal_8bpc)
+ lea baseq, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4
+%ifidn %1_%2, dct_dct
+ vpbroadcastw m0, [cq]
+ vpbroadcastd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [cq], eobd
+ pmulhrsw m0, m1
+ mova m1, m0
+ jmp m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
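+; 4-point inverse DCT with two rows packed per register:
+; in0/in1 in m0 and in2/in3 in m1 -> out0/out1 in m0, out3/out2 in m1.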
+%macro IDCT4_1D_PACKED 0
+ vpbroadcastd m4, [o(pd_2048)]
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+%endmacro
+
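+; 4-point inverse ADST (packed): outputs out0/out1 in m0 and out2/out3 in m1.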
+%macro IADST4_1D_PACKED 0
+ punpcklwd m4, m1, m0 ; in2 in0
+ punpckhwd m5, m1, m0 ; in3 in1
+.main2:
+ vpbroadcastd m3, [o(pd_2048)]
+ mova m0, m3
+ vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd}
+ mova m2, m3
+ vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd}
+ mova m1, m3
+ vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd}
+ vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd}
+ vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd}
+ vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd}
+ vpdpwssd m1, m5, [o(pd_3344)] {bcstd}
+ vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd}
+ REPX {psrad x, 12}, m0, m2, m1, m3
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m3 ; out2 out3
+%endmacro
+
+INIT_XMM avx512icl
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ IDCT4_1D_PACKED
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call .main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+.end2:
+ ITX4_END 0, 1, 2, 3
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8
+%ifidn %1_%2, dct_dct
+ movd xmm1, [o(pw_2896x8)]
+ pmulhrsw xmm0, xmm1, [cq]
+ movd xmm2, [o(pw_2048)]
+ pmulhrsw xmm0, xmm1
+ pmulhrsw xmm0, xmm1
+ pmulhrsw xmm0, xmm2
+ vpbroadcastw ym0, xmm0
+ mova ym1, ym0
+ jmp m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
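+; 8-point inverse DCT with two rows packed per register; outputs are returned
+; pairwise in m0-m3 (out0/out1, out3/out2, out4/out5, out7/out6).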
+%macro IDCT8_1D_PACKED 0
+ punpckhwd m5, m3, m0 ; in7 in1
+ punpckhwd m4, m1, m2 ; in3 in5
+ punpcklwd m3, m1 ; in6 in2
+ punpcklwd m2, m0 ; in4 in0
+.main2:
+ vpbroadcastd m6, [o(pd_2048)]
+ ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
+ psubsw m0, m5, m4 ; t5a t6a (interleaved)
+ paddsw m4, m5 ; t4 t7 (interleaved)
+ ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
+ ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5
+%if mmsize > 16
+ vbroadcasti32x4 m1, [o(deint_shuf)]
+ pshufb m4, m1
+%else
+ pshufb m4, [o(deint_shuf)]
+%endif
+ psubsw m1, m2, m3 ; tmp3 tmp2
+ paddsw m3, m2 ; tmp0 tmp1
+ punpckhqdq m2, m4, m0 ; t7 t6
+ punpcklqdq m4, m0 ; t4 t5
+ paddsw m0, m3, m2 ; out0 out1
+ psubsw m3, m2 ; out7 out6
+ psubsw m2, m1, m4 ; out4 out5
+ paddsw m1, m4 ; out3 out2
+%endmacro
+
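+; 8-point inverse ADST; the argument selects between the pass 1 and pass 2
+; variants.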
+%macro IADST8_1D_PACKED 1 ; pass
+ vpbroadcastd m6, [o(pd_2048)]
+%if %1 == 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+ psubsw m4, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+ vbroadcasti32x4 m2, [o(deint_shuf)]
+%else
+ mova m2, [o(deint_shuf)]
+%endif
+ vprord m1, 16
+ psubsw m3, m0, m1 ; t3 t2
+ paddsw m0, m1 ; -out7 out0
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ pshufb m0, m2
+ pshufb m4, m2
+ mova m2, m6
+ vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd}
+ mova m5, m6
+ vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd}
+ psrad m2, 12
+ psrad m5, 12
+ packssdw m2, m5 ; out4 -out5
+ mova m5, m6
+ vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd}
+ mova m3, m6
+ vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd}
+ psrad m5, 12
+ psrad m3, 12
+ packssdw m1, m3, m5 ; out2 -out3
+%else
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
+ psubsw m4, m0, m2 ; t4 t5
+ paddsw m0, m2 ; t0 t1
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ shufps m2, m5, m4, q1032
+ punpckhwd m4, m2
+ punpcklwd m5, m2
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a
+ psubsw m2, m0, m1 ; t2 t3
+ paddsw m0, m1 ; out0 -out7
+ psubsw m1, m4, m5 ; t6 t7
+ paddsw m4, m5 ; -out1 out6
+ vpbroadcastd m5, [o(pw_2896x8)]
+ punpckhqdq m3, m2, m1 ; t3 t7
+ punpcklqdq m2, m1 ; t2 t6
+ paddsw m1, m2, m3 ; t2+t3 t6+t7
+ psubsw m2, m3 ; t2-t3 t6-t7
+ punpckhqdq m3, m4, m0 ; out6 -out7
+ punpcklqdq m0, m4 ; out0 -out1
+ pmulhrsw m2, m5 ; out4 -out5
+ pshufd m1, m1, q1032
+ pmulhrsw m1, m5 ; out2 -out3
+%endif
+%endmacro
+
+INIT_YMM avx512icl
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ IDCT4_1D_PACKED
+ vbroadcasti32x4 m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti32x4 m0, m0, xm2, 1
+ vinserti32x4 m1, m1, xm3, 1
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x8_internal_8bpc).end2
+ALIGN function_align
+.main:
+ WRAP_XMM IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call .main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti32x4 m0, xm2, 1
+ vinserti32x4 m1, xm3, 1
+ pxor m5, m5
+ psubw m5, m4
+.end:
+ punpcklqdq m4, m5
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ vpbroadcastd m3, strided
+ pmulld m5, m3, [o(pd_0to15)]
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdd m3{k1}, [dstq+m5]
+ pxor m4, m4
+ mova [cq], zmm20
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpscatterdd [dstq+m5]{k2}, m0
+ RET
+ALIGN function_align
+.main_pass1:
+ punpckhwd xm0, xm4, xm3 ; 0 7
+ punpckhwd xm1, xm5, xm2 ; 2 5
+ punpcklwd xm2, xm5 ; 4 3
+ punpcklwd xm3, xm4 ; 6 1
+ WRAP_XMM IADST8_1D_PACKED 1
+ punpcklqdq xm3, xm4, xm0 ; out6 -out7
+ punpckhqdq xm0, xm4 ; out0 -out1
+ ret
+ALIGN function_align
+.main_pass2:
+ WRAP_XMM IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m3
+ punpckhwd m1, m3
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call m(iadst_4x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti32x4 m3, xm1, 1
+ vinserti32x4 m2, xm0, 1
+ pxor m4, m4
+ psubw m4, m5
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INIT_ZMM avx512icl
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m0, [o(pw_2896x8)]
+ pmulhrsw m0, [cq]
+ mova m1, [o(int8_permB)]
+ vpbroadcastd m2, [o(pw_1697x8)]
+ vpermb m0, m1, m0
+ pmulhrsw m2, m0
+ paddsw m0, m2
+ vextracti32x8 ym1, m0, 1
+ jmp tx2q
+.pass2:
+ vpbroadcastd ym4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ vpbroadcastw m0, r6d
+ mova m1, m0
+ jmp m(iadst_4x16_internal_8bpc).end3
+%endif
+%endmacro
+
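+; 16-point inverse DCT with two rows packed per register; outputs are
+; returned pairwise in m0-m7.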
+%macro IDCT16_1D_PACKED 0
+ punpckhwd m8, m7, m0 ; dct16 in15 in1
+ punpcklwd m9, m4, m0 ; dct4 in2 in0
+ punpckhwd m0, m3, m4 ; dct16 in7 in9
+ punpcklwd m7, m1 ; dct8 in7 in1
+ punpckhwd m1, m6 ; dct16 in3 in13
+ punpcklwd m3, m5 ; dct8 in3 in5
+ punpckhwd m5, m2 ; dct16 in11 in5
+ punpcklwd m6, m2 ; dct4 in3 in1
+cglobal_label .main2
+ vpbroadcastd m10, [o(pd_2048)]
+.main3:
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6 ; 0x33...
+ ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a
+ ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a
+ ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
+ ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
+ ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a
+ ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a
+.main4:
+ psubsw m2, m8, m0 ; t9 t14
+ paddsw m8, m0 ; t8 t15
+ psubsw m4, m1, m5 ; t10 t13
+ paddsw m1, m5 ; t11 t12
+ ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2
+ psubsw m0, m8, m1 ; t11a t12a
+ paddsw m8, m1 ; t8a t15a
+ psubsw m1, m7, m3 ; t5a t6a
+ paddsw m7, m3 ; t4 t7
+.main5:
+ ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a
+ ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
+%if mmsize > 16
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+%else
+ mova m5, [o(deint_shuf)]
+%endif
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ paddsw m3, m2, m4 ; t9 t14
+ psubsw m2, m4 ; t10 t13
+ pshufb m8, m5
+ pshufb m7, m5
+ pshufb m3, m5
+ ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6
+ ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12
+ ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a
+ punpckhqdq m2, m7, m1 ; t7 t6
+ punpcklqdq m7, m1 ; t4 t5
+ psubsw m1, m9, m6 ; dct4 out3 out2
+ paddsw m9, m6 ; dct4 out0 out1
+ packssdw m5, m11 ; t12 t13a
+ packssdw m4, m0 ; t11 t10a
+ punpckhqdq m0, m8, m3 ; t15a t14
+ punpcklqdq m8, m3 ; t8a t9
+ psubsw m3, m9, m2 ; dct8 out7 out6
+ paddsw m9, m2 ; dct8 out0 out1
+ psubsw m2, m1, m7 ; dct8 out4 out5
+ paddsw m1, m7 ; dct8 out3 out2
+ psubsw m7, m9, m0 ; out15 out14
+ paddsw m0, m9 ; out0 out1
+ psubsw m6, m1, m5 ; out12 out13
+ paddsw m1, m5 ; out3 out2
+ psubsw m5, m2, m4 ; out11 out10
+ paddsw m2, m4 ; out4 out5
+ psubsw m4, m3, m8 ; out8 out9
+ paddsw m3, m8 ; out7 out6
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova ym1, [cq+32*2]
+ vinserti32x8 m1, [cq+32*0], 1
+ mova m0, [o(int16_perm)]
+ mova ym2, [cq+32*3]
+ vinserti32x8 m2, [cq+32*1], 1
+ vpbroadcastd m4, [o(pd_2048)]
+ vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
+ vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2
+ vpbroadcastd m4, [o(pw_16384)]
+ psubsw m3, m1, m2
+ paddsw m1, m2 ; out0 out1
+ vprord m3, 16 ; out2 out3
+ punpckldq m0, m1, m3
+ punpckhdq m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, ym1, 1
+ vextracti32x4 xm4, m0, 2
+ vextracti32x4 xm5, m1, 2
+ vextracti32x4 xm6, m0, 3
+ vextracti32x4 xm7, m1, 3
+ call .main
+ vinserti32x4 ym0, xm2, 1
+ vinserti32x4 ym1, xm3, 1
+ vinserti32x4 ym4, xm6, 1
+ vinserti32x4 ym5, xm7, 1
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m1, ym5, 1
+ vpbroadcastd m5, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x16_internal_8bpc).end2
+ALIGN function_align
+.main:
+ WRAP_XMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [o(permB)]
+ vpermq m0, m1, [cq+64*0]
+ vpermq m1, m1, [cq+64*1]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m3, [o(pw_16384)]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m3
+ pmulhrsw m0, m3
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m5, [o(pw_2048)]
+ psrlq m10, 4
+ psubw m6, m8, m5
+.end:
+ vpbroadcastd m7, [o(pw_2896x8)]
+ paddsw ym1, ym2, ym4
+ psubsw ym2, ym4
+ vinserti32x8 m1, ym2, 1
+ pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10
+ psrlq m0, m10, 4
+ vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
+ vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
+ punpcklqdq m5, m6
+.end2:
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+.end3:
+ vpbroadcastd m3, strided
+ pmulld m5, m3, [o(pd_0to15)]
+ kxnorw k1, k1, k1
+ kmovw k2, k1
+ vpgatherdd m3{k1}, [dstq+m5]
+ pxor m4, m4
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpscatterdd [dstq+m5]{k2}, m0
+ RET
+ALIGN function_align
+.main:
+ movu m3, [o(permB+1)]
+ psrlq m10, m3, 4
+.main2:
+ vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10
+ vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ kxnorb k1, k1, k1
+ punpckhwd m4, m3, m0 ; in12 in3 in14 in1
+ punpcklwd m0, m3 ; in0 in15 in2 in13
+ kshiftrb k1, k1, 4
+ vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5
+ vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9
+INIT_YMM avx512icl
+ vpcmpub k7, m13, m9, 6 ; 0x33...
+ pxor m8, m8
+ ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5
+ ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
+ ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
+ ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5
+ psubsw m2, m0, m3 ; t9a t8a t11a t10a
+ paddsw m0, m3 ; t1a t0a t3a t2a
+ psubsw m3, m1, m4 ; t13a t12a t15a t14a
+ paddsw m4, m1 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5
+ psubw m7, m8, m7
+ ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4
+ vpbroadcastd m6, [o(pw_3784_m1567)]
+ vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
+ psubsw m1, m0, m4 ; t5 t4 t7 t6
+ paddsw m0, m4 ; t1 t0 t3 t2
+ psubsw m4, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
+ ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a
+ vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a
+ vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14
+ vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13
+ pshufd m2, m2, q1032 ; t6a t7a t14 t15
+ psubsw m4, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m1, m2 ; out12 -out3 -out13 out2
+ psubsw m1, m2 ; t7 t6 t15a t14a
+ punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
+ punpcklqdq m4, m1 ; t3a t7 t11 t15a
+INIT_ZMM avx512icl
+ vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [o(permB)]
+ vpermq m0, m1, [cq+64*0]
+ vpermq m1, m1, [cq+64*1]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m3, [o(pw_16384)]
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_8bpc).main
+ vpbroadcastd m6, [o(pw_2048)]
+ psrlq m10, 12
+ psubw m5, m8, m6
+ jmp m(iadst_4x16_internal_8bpc).end
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m2, [o(int16_perm)]
+ vpermb m1, m2, [cq+64*0]
+ vpermb m2, m2, [cq+64*1]
+ vpbroadcastd m4, [o(pw_1697x8)]
+ vpbroadcastd m0, [o(pd_m1)]
+ pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is
+ vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal
+ pmulhrsw m4, m2 ; it still works, but if the input is -1 the
+ vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
+ vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless
+ vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here.
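+    ; e.g. for an input of -1: pmulhrsw(-1, 1697*8) = 0 and the desired
+    ; signed average is (-1+0+1)>>1 = 0, but unsigned pavgw(0xffff, 0)
+    ; yields 0x8000 (-32768), so the NEQ mask with {z} zeroing writes the
+    ; correct 0 for those lanes instead.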
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x16)]
+ vpbroadcastd m5, [o(pw_2048)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m0
+ paddsw m1, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x16_internal_8bpc).end2
+
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
+ movq xm%3, [dstq ]
+ movhps xm%3, [dstq+%5]
+ movq xm%4, [dstq+%6]
+ movhps xm%4, [dstq+%7]
+ pmovzxbw m%3, xm%3
+ pmovzxbw m%4, xm%4
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vextracti32x4 xm%4, m%3, 1
+ movq [dstq ], xm%3
+ movhps [dstq+%6], xm%3
+ movq [dstq+%5], xm%4
+ movhps [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_2048)]
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ jmp m(iadst_8x4_internal_8bpc).end3
+%endif
+%endmacro
+
+INIT_YMM avx512icl
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm3, [o(pw_2896x8)]
+ pmulhrsw xm0, xm3, [cq+16*0]
+ pmulhrsw xm1, xm3, [cq+16*1]
+ pmulhrsw xm2, xm3, [cq+16*2]
+ pmulhrsw xm3, [cq+16*3]
+ call m(idct_4x8_internal_8bpc).main
+ vbroadcasti32x4 m4, [o(deint_shuf)]
+ vinserti32x4 m3, m1, xm3, 1
+ vinserti32x4 m1, m0, xm2, 1
+ shufps m0, m1, m3, q0220
+ shufps m1, m3, q1331
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti32x4 m0, xm2, 1
+ vinserti32x4 m1, xm3, 1
+ pxor m3, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ psubsw m3, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+.end2:
+ vpbroadcastd m2, [o(pw_2048)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+.end3:
+ pxor m2, m2
+ mova [cq], zmm18
+ lea r6, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ RET
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti32x4 m3, m3, xm1, 1
+ vinserti32x4 m2, m2, xm0, 1
+ punpckhwd m1, m3, m2
+ punpcklwd m3, m2
+ pxor m0, m0
+ psubsw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m2, m1
+ vpermq m1, m0, q2031
+ vpermq m0, m2, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm0, [cq+16*1]
+ vinserti32x4 m2, [cq+16*2], 1
+ vinserti32x4 m0, [cq+16*3], 1
+ vpbroadcastd m3, [o(pw_2896x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ paddsw m0, m0
+ paddsw m1, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+INIT_ZMM avx512icl
+ movsx r6d, word [cq]
+ mov [cq], eobd
+.dconly:
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+.dconly2:
+ vpbroadcastd ym2, strided
+ imul r6d, 181
+ pmulld ym5, ym2, [o(pd_0to15)]
+ kxnorb k1, k1, k1
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m3, m3
+ vpbroadcastw m4, r6d
+.dconly_loop:
+ kmovb k2, k1
+ vpgatherdq m2{k1}, [dstq+ym5]
+ punpcklbw m0, m2, m3
+ punpckhbw m1, m2, m3
+ paddw m0, m4
+ paddw m1, m4
+ packuswb m0, m1
+ kmovb k1, k2
+ vpscatterdq [dstq+ym5]{k2}, m0
+ lea dstq, [dstq+strideq*8]
+ sub r3d, 8
+ jg .dconly_loop
+ RET
+INIT_YMM avx512icl
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ call .main
+ shufps m4, m0, m1, q0220
+ shufps m5, m0, m1, q1331
+ shufps m1, m2, m3, q0220
+ shufps m3, m2, m3, q1331
+ vbroadcasti32x4 m0, [o(deint_shuf)]
+ vpbroadcastd m2, [o(pw_16384)]
+ REPX {pshufb x, m0}, m4, m5, m1, m3
+ REPX {pmulhrsw x, m2}, m4, m5, m1, m3
+ vinserti32x4 m0, m4, xm1, 1
+ vshufi32x4 m2, m4, m1, 0x03
+ vinserti32x4 m1, m5, xm3, 1
+ vshufi32x4 m3, m5, m3, 0x03
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ jmp m(iadst_8x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call .main_pass1
+ vpbroadcastd m5, [o(pw_16384_m16384)]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ REPX {pmulhrsw x, m5}, m3, m4, m0, m1
+ vshufi32x4 m2, m3, m0, 0x03
+ vinserti32x4 m0, m3, xm0, 1
+ vshufi32x4 m3, m4, m1, 0x03
+ vinserti32x4 m1, m4, xm1, 1
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call .main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vpbroadcastd xm4, [o(pw_4096)]
+ psubw m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+.end4:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ lea r6, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 5
+ RET
+ALIGN function_align
+.main_pass1:
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+ IADST8_1D_PACKED 1
+ punpcklqdq m3, m4, m0 ; out6 -out7
+ punpckhqdq m0, m4 ; out0 -out1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call m(iadst_8x8_internal_8bpc).main_pass1
+ vpbroadcastd m5, [o(pw_m16384_16384)]
+ punpckhwd m4, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ punpckhwd m0, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m3, m2, m1
+ punpcklwd m2, m1
+ REPX {pmulhrsw x, m5}, m0, m4, m3, m2
+ vinserti32x4 m1, m0, xm3, 1
+ vshufi32x4 m3, m0, m3, 0x03
+ vinserti32x4 m0, m4, xm2, 1
+ vshufi32x4 m2, m4, m2, 0x03
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vpbroadcastd xm5, [o(pw_4096)]
+ psubw m4, m5 ; lower half = -2048, upper half = 2048
+ vpermq m5, m3, q2031
+ vpermq m3, m0, q2031
+ vpermq m0, m2, q2031
+ vpermq m2, m1, q2031
+ pmulhrsw m1, m0, m4
+ pmulhrsw m0, m5, m4
+ jmp m(iadst_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*1]
+ vinserti32x4 m3, [cq+16*4], 1
+ vinserti32x4 m2, [cq+16*5], 1
+ mova xm4, [cq+16*2]
+ mova xm0, [cq+16*3]
+ vinserti32x4 m4, [cq+16*6], 1
+ vinserti32x4 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_8X16_LOAD_COEFS 0
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m0, m4, [cq+32*0]
+ add cq, 32*4
+ pmulhrsw m7, m4, [cq+32*3]
+ pmulhrsw m1, m4, [cq-32*3]
+ pmulhrsw m6, m4, [cq+32*2]
+ pmulhrsw m2, m4, [cq-32*2]
+ pmulhrsw m5, m4, [cq+32*1]
+ pmulhrsw m3, m4, [cq-32*1]
+ pmulhrsw m4, [cq+32*0]
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m3, [o(permB)]
+ vpermq m0, m3, [cq+64*0]
+ vpbroadcastd m4, [o(pw_2896x8)]
+ vpermq m1, m3, [cq+64*1]
+ vpermq m2, m3, [cq+64*2]
+ vpermq m3, m3, [cq+64*3]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
+ punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3
+ punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
+ punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3
+ REPX {pmulhrsw x, m5}, m4, m0, m2, m1
+ punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
+ punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1
+ punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
+ punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1
+ punpckhdq m1, m0, m2 ; 1 5 9 13
+ punpckldq m0, m2 ; 0 4 8 12
+ punpckldq m2, m3, m4 ; 2 6 10 14
+ punpckhdq m3, m4 ; 3 7 11 15
+ jmp tx2q
+.pass2:
+ vprord m5, [o(int16_perm)], 16
+ vshufi32x4 m2, m2, q1320 ; 2 10 14 6
+ vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11
+ vshufi32x4 m1, m3, q0132 ; 9 13 7 3
+ vpermb m9, m5, m0
+ vpermb m7, m5, m2
+ vpermb m8, m5, m4
+ vpermb m0, m5, m1
+ vextracti32x8 ym6, m9, 1
+ vextracti32x8 ym3, m7, 1
+ vextracti32x8 ym5, m8, 1
+ vextracti32x8 ym1, m0, 1
+ call .main2
+ mova ym8, [o(gather8a)]
+ lea r3, [dstq+strideq*4]
+ pmovzxdq m9, ym8
+ pshufd ym8, ym8, q1230
+ vpermt2q m0, m9, m4
+ vpermt2q m1, m9, m5
+ vpermt2q m2, m9, m6
+ vpermt2q m3, m9, m7
+.end:
+ vpbroadcastd m7, [o(pw_2048)]
+.end2:
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+.end3:
+ pmulhrsw m2, m7
+ pmulhrsw m3, m7
+.end4:
+ vpbroadcastd ym6, strided
+ kxnorb k1, k1, k1
+ pxor m4, m4
+ pmulld ym8, ym6
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [dstq+ym8]
+ kmovb k1, k2
+ vpgatherdq m7{k2}, [r3+ym8]
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ kmovb k2, k1
+ mova [cq+64*2], m4
+ mova [cq+64*3], m4
+ punpcklbw m5, m6, m4
+ punpckhbw m6, m4
+ paddw m0, m5
+ paddw m1, m6
+ packuswb m0, m1
+ vpscatterdq [dstq+ym8]{k1}, m0
+ punpcklbw m6, m7, m4
+ punpckhbw m7, m4
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m2, m3
+ vpscatterdq [r3+ym8]{k2}, m2
+ RET
+ALIGN function_align
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ vpbroadcastd ym3, [o(pw_401_4076x8)]
+ vpbroadcastd ym5, [o(pw_799_4017x8)]
+ vpbroadcastd ym4, [o(pw_m1189_3920x8)]
+ pxor ym6, ym6
+ punpckhwd ym2, ym0, ym0
+ pmulhrsw ym2, ym3 ; t8a t15a
+ punpcklwd ym7, ym1, ym1
+ pmulhrsw ym7, ym5 ; t4a t7a
+ punpckhwd ym1, ym1
+ pmulhrsw ym4, ym1 ; t11a t12a
+ vpcmpub k7, ym13, ym10, 6
+ punpcklwd ym9, ym6, ym0
+ psubsw ym0, ym2, ym4 ; t11a t12a
+ paddsw ym8, ym2, ym4 ; t8a t15a
+ mova ym1, ym7
+ jmp .main5
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ pxor ym6, ym6
+ punpckhwd ym8, ym0, ym0
+ punpckhwd ym4, ym3, ym3
+ punpckhwd ym5, ym2, ym2
+ punpcklwd ym7, ym1, ym1
+ punpckhwd ym1, ym1
+ punpcklwd ym3, ym3
+ punpcklwd ym9, ym6, ym0
+ punpcklwd ym6, ym2
+ vpbroadcastd ym2, [o(pw_401_4076x8)]
+ vpbroadcastd ym0, [o(pw_m2598_3166x8)]
+ vpbroadcastd ym11, [o(pw_1931_3612x8)]
+ vpbroadcastd ym12, [o(pw_m1189_3920x8)]
+ pmulhrsw ym8, ym2 ; t8a t15a
+ vpbroadcastd ym2, [o(pw_799_4017x8)]
+ pmulhrsw ym0, ym4 ; t9a t14a
+ vpbroadcastd ym4, [o(pw_m2276_3406x8)]
+ pmulhrsw ym5, ym11 ; t10a t13a
+ pmulhrsw ym1, ym12 ; t11a t12a
+ pmulhrsw ym7, ym2 ; t4a t7a
+ pmulhrsw ym3, ym4 ; t5a t6a
+ vpcmpub k7, ym13, ym10, 6
+ jmp .main4
+ALIGN function_align
+cglobal_label .main
+ WRAP_YMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_8bpc).main_pass1
+ vbroadcasti32x4 m6, [o(int_shuf1)]
+ vpbroadcastd m7, [o(pw_16384_m16384)]
+ punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
+ pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
+.pass1_end:
+ REPX {pmulhrsw x, m7}, m3, m5, m4, m2
+ punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m5
+ punpckhqdq m3, m5
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ vpbroadcastd m6, [o(pw_2048)]
+ psrlq m10, 4
+ psubw m7, m8, m6
+.pass2_end:
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m5, m2 ; out8 -out11 -out9 out10
+ mova ym8, [o(gather8c)]
+ lea r3, [dstq+strideq]
+ psrlq m2, m10, 4
+ vpermi2q m2, m0, m3 ; 1 3 13 15
+ vpermt2q m0, m10, m3 ; 0 2 12 14
+ psrlq m3, m10, 8
+ vpermi2q m3, m1, m5 ; 5 7 9 11
+ psrlq m10, 12
+ vpermt2q m1, m10, m5 ; 4 6 8 10
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ jmp m(idct_8x16_internal_8bpc).end3
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m5, m2, [cq+64*0]
+ pmulhrsw m3, m2, [cq+64*3]
+ pmulhrsw m1, m2, [cq+64*1]
+ pmulhrsw m2, [cq+64*2]
+ movu m4, [o(permA+3)]
+ psrlq m10, m4, 4
+ mova m6, m4
+ vpermi2q m4, m5, m3 ; in0 in12 in2 in14
+ vpermt2q m5, m10, m3 ; in15 in3 in13 in1
+ vpermi2q m6, m1, m2 ; in4 in8 in6 in10
+ vpermt2q m1, m10, m2 ; in11 in7 in9 in5
+ jmp .main
+ALIGN function_align
+.main_pass2:
+ mova m4, [o(permC)]
+ psrlq m5, m4, 4
+ vpermi2q m4, m0, m2 ; in0 in12 in2 in14
+ psrlq m6, m5, 4
+ vpermi2q m5, m1, m3 ; in15 in3 in13 in1
+ psrlq m10, m6, 4
+ vpermi2q m6, m0, m2 ; in4 in8 in6 in10
+ vpermt2q m1, m10, m3 ; in11 in7 in9 in5
+.main:
+ punpcklwd m0, m4, m5 ; in0 in15 in2 in13
+ punpckhwd m4, m5 ; in12 in3 in14 in1
+ punpcklwd m5, m6, m1 ; in4 in11 in6 in9
+ punpckhwd m6, m1 ; in8 in7 in10 in5
+cglobal_label .main2
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ kxnorb k1, k1, k1
+ vpcmpub k7, m13, m9, 6 ; 0x33...
+ pxor m8, m8
+ ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5
+ ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
+ ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5
+ ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
+ psubsw m2, m0, m6 ; t9a t8a t11a t10a
+ paddsw m0, m6 ; t1a t0a t3a t2a
+ psubsw m3, m5, m4 ; t13a t12a t15a t14a
+ paddsw m5, m4 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5
+ psubw m7, m8, m7
+ ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4
+ vpbroadcastd m6, [o(pw_3784_m1567)]
+ vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
+ psubsw m1, m0, m5 ; t5 t4 t7 t6
+ paddsw m0, m5 ; t1 t0 t3 t2
+ psubsw m4, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
+ ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a
+ vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a
+ vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15
+ vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12
+ pshufd m2, m2, q1032 ; t7a t6a t15 t14
+ psubsw m4, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m1, m2 ; out12 -out3 -out13 out2
+ psubsw m1, m2 ; t7 t6 t15a t14a
+ punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
+ punpcklqdq m4, m1 ; t3a t7 t11 t15a
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_8bpc).main_pass1
+ vbroadcasti32x4 m6, [o(int_shuf2)]
+ vpbroadcastd m7, [o(pw_m16384_16384)]
+ punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
+ pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
+ jmp m(iadst_8x16_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_8x16_internal_8bpc).main_pass2
+ vpbroadcastd m7, [o(pw_2048)]
+ psrlq m10, 36
+ psubw m6, m8, m7
+ jmp m(iadst_8x16_internal_8bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [o(int16_perm)]
+ vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
+ vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
+ vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
+ vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
+ vpbroadcastd m5, [o(pw_2896x8)]
+ punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+    punpckldq            m2, m4, m0 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3
+ REPX {pmulhrsw x, m5}, m1, m2, m3, m4
+ punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
+ punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1
+ punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2
+ punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m7, [o(pw_1697x16)]
+ mova ym8, [o(gather8b)]
+ lea r3, [dstq+strideq*2]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_8x16_internal_8bpc).end
+
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ pmovzxbw m%3, [dstq+%5]
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+ pmovzxbw m%4, [dstq+%6]
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vpermq m%3, m%3, q3120
+ mova [dstq+%5], xm%3
+ vextracti32x4 [dstq+%6], m%3, 1
+%endmacro
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
+%endif
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+ mova xm4, [cq+16*4]
+ mova xm5, [cq+16*5]
+ mova xm6, [cq+16*6]
+ mova xm7, [cq+16*7]
+ call m(idct_4x16_internal_8bpc).main
+ vpbroadcastd m8, [o(pw_16384)]
+ vinserti32x4 ym1, xm3, 1 ; 3 2 7 6
+ vinserti32x4 ym5, xm7, 1 ; b a f e
+ vinserti32x4 ym0, xm2, 1 ; 0 1 4 5
+ vinserti32x4 ym4, xm6, 1 ; 8 9 c d
+ vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e
+ vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d
+ pmulhrsw m1, m8
+ pmulhrsw m0, m8
+ pshufd m1, m1, q1032
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ mova m2, [o(permA)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+64*0]
+ mova m1, [cq+64*1]
+ movshdup m3, [o(permB)]
+ psrlq m10, m3, 4
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m6, [o(pw_16384_m16384)]
+ psrlq m0, m10, 4
+ psrlq m10, 8
+.pass1_end:
+ punpcklwd ym5, ym4, ym2
+ punpckhwd ym4, ym2
+ vinserti32x8 m5, ym4, 1
+ mova m1, m9
+ vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16}
+ mova m4, m9
+ vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16}
+ psrad m1, 12
+ psrad m4, 12
+ packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5
+ vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
+ vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ jmp tx2q
+.pass2:
+ call .main
+ movu m2, [o(permA+1)]
+.end:
+ vpbroadcastd m3, [o(pw_2048)]
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+.end2:
+ psrlq m3, m2, 4
+ vpermi2q m2, m0, m1
+ vpermi2q m3, m0, m1
+.end3:
+ lea r3, [dstq+strideq*2]
+ mova xm1, [dstq+strideq*0]
+ vinserti32x4 ym1, [dstq+strideq*1], 1
+ vinserti32x4 m1, [r3 +strideq*0], 2
+ vinserti32x4 m1, [r3 +strideq*1], 3
+ pxor m4, m4
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [r3 +strideq*0], m0, 2
+ vextracti32x4 [r3 +strideq*1], m0, 3
+ RET
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+64*0]
+ mova m1, [cq+64*1]
+ movshdup m3, [o(permB)]
+ psrlq m10, m3, 4
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m6, [o(pw_m16384_16384)]
+ psrlq m0, m10, 12
+ psrlq m10, 16
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ movu m2, [o(permA+2)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [cq+64*0]
+ mova m2, [cq+64*1]
+ vpbroadcastd m3, [o(pw_1697x16)]
+ vpbroadcastd m4, [o(pw_16384)]
+ mova m5, [o(idtx_16x4p)]
+ shufps m0, m1, m2, q2020
+ shufps m1, m2, q3131
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddsw m0, m2
+ paddsw m1, m3
+ vpermb m0, m5, m0
+ vpermb m1, m5, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ movu m2, [o(permA+1)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 8
+.dconly:
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+.dconly3:
+ imul r6d, 181
+ lea r2, [strideq*3]
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti32x4 ym1, [dstq+strideq*1], 1
+ vinserti32x4 m1, [dstq+strideq*2], 2
+ vinserti32x4 m1, [dstq+r2 ], 3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub r3d, 4
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m0, [cq+32*0], q3120
+ add cq, 32*4
+ vpermq m7, [cq+32*3], q%1
+ vpermq m1, [cq-32*3], q%1
+ vpermq m6, [cq+32*2], q3120
+ vpermq m2, [cq-32*2], q3120
+ vpermq m5, [cq+32*1], q%1
+ vpermq m3, [cq-32*1], q%1
+ vpermq m4, [cq+32*0], q3120
+ REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m1, [o(pw_2896x8)]
+ vpermq m0, [cq+64*0], q3120
+ vpermq m2, [cq+64*1], q3120
+ vpermq m4, [cq+64*2], q3120
+ vpermq m6, [cq+64*3], q3120
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6
+ vextracti32x8 ym1, m0, 1
+ vextracti32x8 ym3, m2, 1
+ vextracti32x8 ym5, m4, 1
+ vextracti32x8 ym7, m6, 1
+ call m(idct_8x16_internal_8bpc).main
+ vbroadcasti32x4 m8, [o(int_shuf1)]
+ vbroadcasti32x4 m9, [o(int_shuf2)]
+ vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
+ vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
+ vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
+ vpbroadcastd m2, [o(pw_16384)]
+ pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3
+ pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
+ pshufb m7, m5, m9 ; k0 l0 k1 l1 k2 l2 k3 l3
+ REPX {pmulhrsw x, m2}, m0, m1, m6, m7
+ punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
+ jmp tx2q
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020 ; 0 1
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020 ; 2 3
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ call .main
+ movshdup m4, [o(permC)]
+ psrlq m6, m4, 4
+ vpermq m5, m4, q1032
+ vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3
+ vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1
+ psrlq m6, m5, 4
+ vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3
+ vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1
+ vpbroadcastd m6, [o(pw_2048)]
+.end:
+ REPX {pmulhrsw x, m6}, m0, m4, m1, m5
+.end2:
+ lea r3, [dstq+strideq*4]
+ lea r4, [strideq*3]
+ mova xm3, [dstq+strideq*0]
+ mova xm6, [dstq+strideq*2]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ vinserti32x4 ym6, [dstq+r4 ], 1
+ vinserti32x4 m3, [r3 +strideq*0], 2
+ vinserti32x4 m6, [r3 +strideq*2], 2
+ vinserti32x4 m3, [r3 +strideq*1], 3
+ vinserti32x4 m6, [r3 +r4 ], 3
+ pxor m7, m7
+ mova [cq+64*0], m7
+ mova [cq+64*1], m7
+ mova [cq+64*2], m7
+ mova [cq+64*3], m7
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m4, m3
+ packuswb m0, m4
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [r3 +strideq*0], m0, 2
+ vextracti32x4 [r3 +strideq*1], m0, 3
+ punpcklbw m3, m6, m7
+ punpckhbw m6, m7
+ paddw m1, m3
+ paddw m5, m6
+ packuswb m1, m5
+ mova [dstq+strideq*2], xm1
+ vextracti32x4 [dstq+r4 ], ym1, 1
+ vextracti32x4 [r3 +strideq*2], m1, 2
+ vextracti32x4 [r3 +r4 ], m1, 3
+ RET
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_8x16_internal_8bpc).main_pass1
+ vpbroadcastd m7, [o(pw_16384_m16384)]
+ psrlq m10, 4
+.pass1_end:
+ punpcklwd m5, m4, m2
+ punpckhwd m4, m2
+ mova m1, m9
+ vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16}
+ mova m6, m9
+ vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16}
+ mova m2, m9
+ vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16}
+ vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16}
+ psrad m1, 12
+ psrad m6, 12
+ packssdw m1, m6 ; out8 -out7 -out9 out6
+ psrad m2, 12
+ psrad m9, 12
+ packssdw m2, m9 ; -out11 out4 out10 -out5
+ psrlq m4, m10, 4
+ vpermi2q m4, m0, m2
+ vpermt2q m0, m10, m2
+ psrlq m5, m10, 8
+ vpermi2q m5, m1, m3
+ psrlq m10, 12
+ vpermt2q m1, m10, m3
+ punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpcklwd m5, m1, m0 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3
+ REPX {pmulhrsw x, m7}, m2, m3, m4, m5
+ jmp tx2q
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ pshufd m4, m0, q1032 ; 1 0
+ pshufd m5, m1, q1032 ; 3 2
+ call .main_pass2
+ movshdup m4, [o(permC)]
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ psrlq m6, m4, 4
+ mova m5, m4
+ vpermi2q m4, m0, m2
+ vpermt2q m0, m6, m2
+ vpermi2q m5, m1, m3
+ vpermt2q m1, m6, m3
+ jmp m(idct_16x8_internal_8bpc).end2
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m3, m4, [cq+64*0]
+ pmulhrsw m1, m4, [cq+64*3]
+ pmulhrsw m2, m4, [cq+64*1]
+ pmulhrsw m4, [cq+64*2]
+ mova m5, [o(int16_perm)]
+ kxnorb k1, k1, k1
+ vpblendmd m0{k1}, m1, m3 ; 0 7
+ vmovdqa32 m3{k1}, m1 ; 6 1
+ vpblendmd m1{k1}, m4, m2 ; 2 5
+ vmovdqa32 m2{k1}, m4 ; 4 3
+ REPX {vpermb x, m5, x}, m0, m1, m2, m3
+ IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ pxor m5, m5
+ psubd m5, m6
+ packssdw m6, m5
+ pmulhrsw m2, m6
+ pmulhrsw m3, m6
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_8x16_internal_8bpc).main_pass1
+ vpbroadcastd m7, [o(pw_m16384_16384)]
+ psrlq m10, 20
+ jmp m(iadst_16x8_internal_8bpc).pass1_end
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ pshufd m4, m0, q1032 ; 1 0
+ pshufd m5, m1, q1032 ; 3 2
+ call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m4, [o(permC)]
+ pmulhrsw m5, m6, m0
+ pmulhrsw m0, m6, m1
+ psrlq m1, m4, 12
+ psrlq m4, 8
+ mova m7, m4
+ vpermi2q m4, m0, m3
+ vpermt2q m0, m1, m3
+ vpermi2q m1, m5, m2
+ vpermt2q m5, m7, m2
+ jmp m(idct_16x8_internal_8bpc).end2
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m0, [o(pw_2896x8)]
+ pmulhrsw m3, m0, [cq+64*0]
+ pmulhrsw m4, m0, [cq+64*1]
+ pmulhrsw m5, m0, [cq+64*2]
+ pmulhrsw m0, [cq+64*3]
+ vpbroadcastd m7, [o(pw_1697x16)]
+ vpbroadcastd m8, [o(pw_16384)]
+ shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
+ shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7
+ shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
+ shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7
+ mova m9, [o(int8_permA)]
+ pmulhrsw m0, m7, m2
+ pmulhrsw m1, m7, m3
+ pmulhrsw m6, m7, m4
+ pmulhrsw m7, m5
+ REPX {pmulhrsw x, m8}, m0, m1, m6, m7
+ paddsw m2, m0
+ paddsw m3, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ REPX {vpermb x, m9, x}, m2, m3, m4, m5
+ jmp tx2q
+.pass2:
+ mova m7, [o(permB)]
+ vpbroadcastd m6, [o(pw_4096)]
+ vpermq m0, m7, m2
+ vpermq m4, m7, m4
+ vpermq m1, m7, m3
+ vpermq m5, m7, m5
+ jmp m(idct_16x8_internal_8bpc).end
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m7, [o(permB)]
+ vpermq m0, m7, [cq+64*0]
+ vpermq m1, m7, [cq+64*1]
+ vpermq m2, m7, [cq+64*2]
+ vpermq m3, m7, [cq+64*3]
+ vpermq m4, m7, [cq+64*4]
+ vpermq m5, m7, [cq+64*5]
+ vpermq m6, m7, [cq+64*6]
+ vpermq m7, m7, [cq+64*7]
+ call .main
+ vbroadcasti32x4 m12, [o(int_shuf1)]
+ vbroadcasti32x4 m11, [o(int_shuf2)]
+ vpbroadcastd m13, [o(pw_8192)]
+ pshufb m0, m12
+ pshufb m8, m1, m11
+ pshufb m2, m12
+ pshufb m9, m3, m11
+ pshufb m4, m12
+ pshufb m10, m5, m11
+ pshufb m6, m12
+ pshufb m11, m7, m11
+ REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11
+ punpckhdq m1, m0, m8
+ punpckldq m0, m8
+ punpckhdq m3, m2, m9
+ punpckldq m2, m9
+ punpckhdq m5, m4, m10
+ punpckldq m4, m10
+ punpckhdq m7, m6, m11
+ punpckldq m6, m11
+ jmp tx2q
+.pass2:
+ vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
+ vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
+ vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
+ vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4
+ vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me
+ vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6
+ vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee
+ vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ vshufi32x4 m3, m1, m5, q3131 ; 6 7
+ vshufi32x4 m1, m5, q2020 ; 2 3
+ vshufi32x4 m5, m7, m9, q2020 ; 10 11
+ vshufi32x4 m7, m9, q3131 ; 14 15
+ call .main
+ mova m8, [o(permD)]
+ psrlq m12, m8, 4
+ psrlq m9, m8, 8
+ psrlq m13, m8, 12
+ mova m10, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ mova m11, m9
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m13, m3
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m12, m6
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+.end:
+ vpbroadcastd m12, [o(pw_2048)]
+.end2:
+ REPX {pmulhrsw x, m12}, m0, m1, m4, m5
+.end3:
+ REPX {pmulhrsw x, m12}, m8, m9, m10, m11
+ lea r3, [strideq*3]
+ lea r4, [dstq+strideq*4]
+ lea r5, [dstq+strideq*8]
+ lea r6, [r4 +strideq*8]
+ mova xm3, [dstq+strideq*0]
+ mova xm6, [dstq+strideq*2]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ vinserti32x4 ym6, [dstq+r3 ], 1
+ vinserti32x4 m3, [r4+strideq*0], 2
+ vinserti32x4 m6, [r4+strideq*2], 2
+ vinserti32x4 m3, [r4+strideq*1], 3
+ vinserti32x4 m6, [r4+r3 ], 3
+ mova xm12, [r5+strideq*0]
+ mova xm13, [r5+strideq*2]
+ vinserti32x4 ym12, [r5+strideq*1], 1
+ vinserti32x4 ym13, [r5+r3 ], 1
+ vinserti32x4 m12, [r6+strideq*0], 2
+ vinserti32x4 m13, [r6+strideq*2], 2
+ vinserti32x4 m12, [r6+strideq*1], 3
+ vinserti32x4 m13, [r6+r3 ], 3
+ pxor m7, m7
+ REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m8, m3
+ packuswb m0, m8
+ punpcklbw m2, m6, m7
+ punpckhbw m6, m7
+ paddw m1, m2
+ paddw m9, m6
+ packuswb m1, m9
+ punpcklbw m2, m12, m7
+ punpckhbw m12, m7
+ paddw m2, m4
+ paddw m10, m12
+ packuswb m2, m10
+ punpcklbw m3, m13, m7
+ punpckhbw m13, m7
+ paddw m3, m5
+ paddw m11, m13
+ packuswb m3, m11
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti32x4 [dstq+r3 ], ym1, 1
+ vextracti32x4 [r4+strideq*0], m0, 2
+ vextracti32x4 [r4+strideq*1], m0, 3
+ vextracti32x4 [r4+strideq*2], m1, 2
+ vextracti32x4 [r4+r3 ], m1, 3
+ mova [r5+strideq*0], xm2
+ vextracti32x4 [r5+strideq*1], ym2, 1
+ mova [r5+strideq*2], xm3
+ vextracti32x4 [r5+r3 ], ym3, 1
+ vextracti32x4 [r6+strideq*0], m2, 2
+ vextracti32x4 [r6+strideq*1], m2, 3
+ vextracti32x4 [r6+strideq*2], m3, 2
+ vextracti32x4 [r6+r3 ], m3, 3
+ RET
+ALIGN function_align
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m10, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+.main_fast4:
+ vpbroadcastd m2, [o(pw_401_4076x8)]
+ vpbroadcastd m4, [o(pw_m1189_3920x8)]
+ vpbroadcastd m3, [o(pw_799_4017x8)]
+ pmulhrsw m2, m8 ; t8a t15a
+ pmulhrsw m4, m1 ; t11a t12a
+ pmulhrsw m7, m3 ; t4a t7a
+ pxor m6, m6
+ psubsw m0, m2, m4 ; t11a t12a
+ paddsw m8, m2, m4 ; t8a t15a
+ mova m1, m7
+ jmp .main5
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ vpbroadcastd m10, [o(pd_2048)]
+.main_fast3:
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+.main_fast5:
+ vpbroadcastd m2, [o(pw_401_4076x8)]
+ vpbroadcastd m4, [o(pw_m2598_3166x8)]
+ vpbroadcastd m11, [o(pw_1931_3612x8)]
+ vpbroadcastd m12, [o(pw_m1189_3920x8)]
+ pmulhrsw m8, m2 ; t8a t15a
+ vpbroadcastd m2, [o(pw_799_4017x8)]
+ pmulhrsw m0, m4 ; t9a t14a
+ vpbroadcastd m4, [o(pw_m2276_3406x8)]
+ pmulhrsw m5, m11 ; t10a t13a
+ pmulhrsw m1, m12 ; t11a t12a
+ pmulhrsw m7, m2 ; t4a t7a
+ pmulhrsw m3, m4 ; t5a t6a
+ jmp .main4
+ALIGN function_align
+cglobal_label .main
+ IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call .main_pass1
+ vpbroadcastd m10, [o(pw_8192_m8192)]
+ punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3
+ punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3
+ punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3
+ punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3
+ punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
+.pass1_end:
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ mova m10, [o(permD)]
+ psrlq m8, m10, 8
+ psrlq m12, m10, 12
+ psrlq m13, m10, 4
+ mova m9, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m12, m3
+ vpbroadcastd m12, [o(pw_2048)]
+ mov r3d, 0xff00ff00
+ mova m11, m10
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m13, m6
+ kmovd k1, r3d
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+ pxor m7, m7
+ vpsubw m12{k1}, m7, m12
+ jmp m(idct_16x16_internal_8bpc).end2
+ALIGN function_align
+.main_pass1:
+ mova m4, [o(permB)]
+ psrlq m3, m4, 4
+ vpermq m0, m4, [cq+64*0]
+ vpermq m7, m3, [cq+64*7]
+ vpermq m6, m4, [cq+64*6]
+ vpermq m1, m3, [cq+64*1]
+ vpermq m2, m4, [cq+64*2]
+ vpermq m5, m3, [cq+64*5]
+ vpermq m4, m4, [cq+64*4]
+ vpermq m3, m3, [cq+64*3]
+ call .main
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ mova m2, m10
+ vpdpwssd m2, m5, m13 ; -out5
+ mova m8, m10
+ vpdpwssd m8, m11, m13 ; out4
+ mova m9, m10
+ vpdpwssd m9, m5, m12 ; out10
+ mova m5, m10
+ vpdpwssd m5, m11, m12 ; -out11
+ mova m11, m10
+ vpdpwssd m11, m3, m13 ; -out7
+ mova m14, m10
+ vpdpwssd m14, m4, m13 ; out6
+ mova m13, m10
+ vpdpwssd m13, m3, m12 ; out8
+ vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9
+ REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10
+ packssdw m2, m8 ; -out5 out4
+ packssdw m5, m9, m5 ; out10 -out11
+ packssdw m3, m11, m14 ; -out7 out6
+ packssdw m4, m13, m10 ; out8 -out9
+ ret
+ALIGN function_align
+.main_pass2:
+ vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
+ vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
+ vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
+ vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4
+ vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me
+ vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6
+ vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee
+ vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ vshufi32x4 m3, m1, m5, q3131 ; 6 7
+ vshufi32x4 m1, m5, q2020 ; 2 3
+ vshufi32x4 m5, m7, m9, q2020 ; 10 11
+ vshufi32x4 m7, m9, q3131 ; 14 15
+cglobal_label .main_pass2b
+ REPX {pshufd x, x, q1032}, m1, m3, m5, m7
+ call .main
+ vpbroadcastd m8, [o(pw_2896x8)]
+ pshufb m2, m11, m12
+ pshufb m5, m12
+ pshufb m3, m12
+ pshufb m4, m12
+ punpcklqdq m9, m5, m2 ; t15a t7
+ punpckhqdq m5, m2 ; t14a t6
+ shufps m2, m3, m4, q1032 ; t2a t10
+ shufps m3, m4, q3210 ; t3a t11
+ psubsw m4, m2, m3 ; out8 -out9
+ paddsw m3, m2 ; -out7 out6
+ paddsw m2, m5, m9 ; -out5 out4
+ psubsw m5, m9 ; out10 -out11
+ REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+ ret
+ALIGN function_align
+.main:
+ vpbroadcastd m10, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ punpckhwd m8, m7, m0 ; in14 in1
+ punpcklwd m0, m7 ; in0 in15
+ punpcklwd m7, m6, m1 ; in12 in3
+ punpckhwd m1, m6 ; in2 in13
+ punpckhwd m6, m5, m2 ; in10 in5
+ punpcklwd m2, m5 ; in4 in11
+ punpcklwd m5, m4, m3 ; in8 in7
+ punpckhwd m3, m4 ; in6 in9
+ vpcmpub k7, m13, m10, 6 ; 0x33...
+ ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3
+ ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5
+ ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7
+ ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9
+ ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11
+ ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13
+ ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15
+ psubsw m4, m0, m5 ; t9a t8a
+ paddsw m0, m5 ; t1a t0a
+ psubsw m5, m1, m6 ; t11a t10a
+ paddsw m1, m6 ; t3a t2a
+ psubsw m6, m2, m7 ; t13a t12a
+ paddsw m2, m7 ; t5a t4a
+ psubsw m7, m3, m8 ; t15a t14a
+ paddsw m3, m8 ; t7a t6a
+ ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9
+ ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13
+ ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11
+ ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15
+ psubsw m8, m1, m3 ; t7 t6
+ paddsw m1, m3 ; t3 t2
+ psubsw m3, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m2, m5, m7 ; t14a t15a
+ paddsw m7, m5 ; t10a t11a
+ psubsw m5, m4, m6 ; t12a t13a
+ paddsw m4, m6 ; t8a t9a
+ ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a
+ ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a
+ ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14
+ ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12
+ vbroadcasti32x4 m12, [o(deint_shuf)]
+ paddsw m6, m4, m7 ; -out1 out14
+ psubsw m4, m7 ; t10 t11
+ psubsw m11, m3, m8 ; t7 t6
+ paddsw m8, m3 ; out12 -out3
+ psubsw m3, m0, m1 ; t3a t2a
+ paddsw m0, m1 ; -out15 out0
+ paddsw m1, m2, m5 ; -out13 out2
+ psubsw m5, m2 ; t15a t14a
+ pshufb m0, m12
+ pshufb m6, m12
+ pshufb m8, m12
+ pshufb m1, m12
+ shufps m7, m6, m0, q1032 ; out14 -out15
+ shufps m0, m6, m0, q3210 ; -out1 out0
+ punpcklqdq m6, m8, m1 ; out12 -out13
+ punpckhqdq m1, m8, m1 ; -out3 out2
+ ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x16_internal_8bpc).main_pass1
+ vpbroadcastd m10, [o(pw_m8192_8192)]
+ punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3
+ punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3
+ punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1
+ punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3
+ punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3
+ punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
+ jmp m(iadst_16x16_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_16x16_internal_8bpc).main_pass2
+ mova m10, [o(permD)]
+ psrlq m8, m10, 8
+ psrlq m12, m10, 12
+ psrlq m13, m10, 4
+ mova m9, m8
+ vpermi2q m8, m7, m5 ; 0 1 4 5
+ vpermt2q m7, m12, m5
+ vpermi2q m9, m6, m4 ; 2 3 6 7
+ vpermt2q m6, m12, m4
+ vpbroadcastd m12, [o(pw_2048)]
+ mov r3d, 0x00ff00ff
+ mova m11, m10
+ vpermi2q m10, m3, m1 ; 8 9 12 13
+ vpermt2q m3, m13, m1
+ kmovd k1, r3d
+ vpermi2q m11, m2, m0 ; 10 11 14 15
+ vpermt2q m2, m13, m0
+ pxor m0, m0
+ vpsubw m12{k1}, m0, m12
+ pmulhrsw m0, m7, m12
+ pmulhrsw m1, m6, m12
+ pmulhrsw m4, m3, m12
+ pmulhrsw m5, m2, m12
+ jmp m(idct_16x16_internal_8bpc).end3
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m8, [o(int16_perm)]
+ vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
+ vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
+ vpbroadcastd m0, [o(pw_1697x16)]
+ vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
+ vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
+ vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3
+ vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3
+ vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3
+ vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3
+ pmulhrsw m9, m0, m1
+ pmulhrsw m10, m0, m2
+ pmulhrsw m11, m0, m3
+ pmulhrsw m12, m0, m4
+ pmulhrsw m13, m0, m5
+ pmulhrsw m14, m0, m6
+ pmulhrsw m15, m0, m7
+ pmulhrsw m0, m8
+ REPX {psraw x, 1}, m9, m10, m11, m12
+ pavgw m1, m9
+ pavgw m2, m10
+ pavgw m3, m11
+ pavgw m4, m12
+ REPX {psraw x, 1}, m13, m14, m15, m0
+ pavgw m5, m13
+ pavgw m6, m14
+ pavgw m7, m15
+ pavgw m8, m0
+ punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
+ punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
+ jmp tx2q
+ALIGN function_align
+.pass2:
+ vpbroadcastd m11, [o(pw_1697x16)]
+ pmulhrsw m12, m11, m0
+ pmulhrsw m13, m11, m1
+ pmulhrsw m14, m11, m2
+ pmulhrsw m15, m11, m3
+ pmulhrsw m8, m11, m4
+ pmulhrsw m9, m11, m5
+ pmulhrsw m10, m11, m6
+ pmulhrsw m11, m7
+ REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ paddsw m0, m12
+ paddsw m1, m13
+ paddsw m2, m14
+ paddsw m3, m15
+ paddsw m8, m4
+ movu m4, [o(permD+2)]
+ paddsw m9, m5
+ paddsw m6, m10
+ paddsw m7, m11
+ psrlq m12, m4, 4
+ mova m5, m4
+ mova m10, m4
+ mova m11, m4
+ vpermi2q m4, m0, m2 ; 8 9 12 13
+ vpermt2q m0, m12, m2 ; 0 1 4 5
+ vpermi2q m5, m1, m3 ; 10 11 14 15
+ vpermt2q m1, m12, m3 ; 2 3 6 7
+ vpermi2q m10, m8, m6
+ vpermt2q m8, m12, m6
+ vpermi2q m11, m9, m7
+ vpermt2q m9, m12, m7
+ jmp m(idct_16x16_internal_8bpc).end
+
+%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4]
+ vpbroadcastd m%4, [o(pw_%5_%6x8)]
+ punpcklwd m%1, m%3, m%3
+ pmulhrsw m%1, m%4
+ vpbroadcastd m%4, [o(pw_%7_%8x8)]
+ punpckhwd m%2, m%3, m%3
+ pmulhrsw m%2, m%4
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ cmp eobd, 107
+ jb .fast
+ mova m5, [cq+64*5]
+ mova m3, [cq+64*3]
+ mova m1, [cq+64*1]
+ mova m7, [cq+64*7]
+ mova m2, [cq+64*2]
+ mova m6, [cq+64*6]
+ mova m0, [cq+64*0]
+ mova m4, [cq+64*4]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m8, [o(idct_8x32p)]
+ vpbroadcastd m9, [o(pw_8192)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ punpckldq m8, m0, m1 ; ab
+ punpckhdq m0, m1
+ punpckldq m1, m2, m3 ; cd
+ punpckhdq m2, m3
+ punpckldq m3, m4, m5 ; ef
+ punpckhdq m4, m5
+ punpckldq m5, m6, m7 ; gh
+ punpckhdq m6, m7
+ REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6
+ punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9
+ punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21
+ punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13
+ punpckhqdq m15, m0, m2 ; 28 4 24 8 7 25 15 17
+ punpcklqdq m20, m3, m5
+ punpckhqdq m16, m3, m5
+ punpcklqdq m19, m4, m6
+ punpckhqdq m17, m4, m6
+ vinserti32x4 ym8, ym18, xm20, 1
+ vshufi32x4 ym1, ym18, ym20, 0x03
+ vinserti32x4 ym9, ym14, xm16, 1
+ vshufi32x4 ym3, ym14, ym16, 0x03
+ vinserti32x4 ym0, ym21, xm19, 1
+ vshufi32x4 ym5, ym21, ym19, 0x03
+ vinserti32x4 ym7, ym15, xm17, 1
+ vshufi32x4 ym6, ym15, ym17, 0x03
+ call m(idct_8x16_internal_8bpc).main2
+ psrlq m12, [o(permB)], 60
+ vpermt2q m14, m12, m16
+ vpermt2q m21, m12, m19
+ vpermt2q m15, m12, m17
+ vpermi2q m12, m18, m20
+ vextracti32x8 ym16, m14, 1
+ vextracti32x8 ym19, m21, 1
+ vextracti32x8 ym17, m15, 1
+ vextracti32x8 ym20, m12, 1
+ call .main2
+ jmp .end
+.fast: ; right half is zero
+ mova m0, [o(int16_perm)]
+ mova ym2, [cq+64*4]
+ vinserti32x8 m2, [cq+64*0], 1
+ mova ym3, [cq+64*6]
+ vinserti32x8 m3, [cq+64*2], 1
+ mova ym4, [cq+64*3]
+ vinserti32x8 m4, [cq+64*5], 1
+ mova ym5, [cq+64*7]
+ vinserti32x8 m5, [cq+64*1], 1
+ REPX {vpermb x, m0, x}, m2, m3, m4, m5
+ call m(idct_16x8_internal_8bpc).main2
+ vbroadcasti32x4 m4, [o(int_shuf3)]
+ vbroadcasti32x4 m5, [o(int_shuf4)]
+ pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3
+ pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3
+ pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3
+ pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3
+ vpbroadcastd m4, [o(pw_8192)]
+ psrlq m5, [o(permB)], 60
+ punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2
+ punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3
+ punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2
+ punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3
+ REPX {pmulhrsw x, m4}, m6, m17, m2, m16
+ vinserti32x4 ym0, ym2, xm6, 1 ; 0 2
+ vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6
+ vinserti32x4 ym14, ym16, xm17, 1 ; 1 3
+ vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7
+ vpermt2q m2, m5, m6 ; 8 10
+ vpermt2q m16, m5, m17 ; 9 11
+ vextracti32x8 ym3, m2, 1 ; 12 14
+ vextracti32x8 ym17, m16, 1 ; 13 15
+ call m(idct_8x16_internal_8bpc).main_fast
+ call .main_fast
+.end:
+ vpbroadcastd ym8, strided
+ pmulld ym8, [o(gather8d)]
+ call .main_end
+ lea r3, [dstq+strideq*4]
+ kxnorb k1, k1, k1
+ lea r4, [dstq+strideq*8]
+ pxor m9, m9
+ lea r1, [r3+strideq*8]
+ kmovb k2, k1
+ vpgatherdq m12{k1}, [r0+ym8]
+ kmovb k1, k2
+ vpgatherdq m13{k2}, [r3+ym8]
+ kmovb k2, k1
+ vpgatherdq m14{k1}, [r4+ym8]
+ kmovb k1, k2
+ vpgatherdq m15{k2}, [r1+ym8]
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m11, m12, m9
+ punpckhbw m12, m9
+ paddw m0, m11
+ paddw m1, m12
+ packuswb m0, m1
+ kmovb k2, k1
+ vpscatterdq [r0+ym8]{k1}, m0
+ punpcklbw m12, m13, m9
+ punpckhbw m13, m9
+ paddw m2, m12
+ paddw m3, m13
+ packuswb m2, m3
+ kmovb k1, k2
+ vpscatterdq [r3+ym8]{k2}, m2
+ punpcklbw m13, m14, m9
+ punpckhbw m14, m9
+ paddw m4, m13
+ paddw m5, m14
+ packuswb m4, m5
+ kmovb k2, k1
+ vpscatterdq [r4+ym8]{k1}, m4
+ punpcklbw m14, m15, m9
+ punpckhbw m15, m9
+ paddw m6, m14
+ paddw m7, m15
+ packuswb m6, m7
+ vpscatterdq [r1+ym8]{k2}, m6
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
+INIT_YMM avx512icl
+ALIGN function_align
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ mova m11, m12
+ mova m17, m20
+ mova m15, m21
+ mova m16, m14
+ jmp .main4
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+ jmp .main3
+ALIGN function_align
+cglobal_label .main
+ punpcklwd m12, m21, m14 ; in31 in1
+ punpckhwd m14, m21 ; in3 in29
+ punpcklwd m21, m20, m15 ; in27 in5
+ punpckhwd m15, m20 ; in7 in25
+ punpcklwd m20, m19, m16 ; in23 in9
+ punpckhwd m16, m19 ; in11 in21
+ punpcklwd m19, m18, m17 ; in19 in13
+ punpckhwd m17, m18 ; in15 in17
+.main2:
+ ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a
+ ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a
+ ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a
+ ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
+ ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
+ ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
+ ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
+ ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
+.main3:
+ psubsw m11, m12, m17 ; t17 t30
+ paddsw m12, m17 ; t16 t31
+ psubsw m17, m15, m20 ; t18 t29
+ paddsw m20, m15 ; t19 t28
+ psubsw m15, m21, m16 ; t21 t26
+ paddsw m21, m16 ; t20 t27
+ psubsw m16, m14, m19 ; t22 t25
+ paddsw m14, m19 ; t23 t24
+.main4:
+ ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a
+ ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a
+ ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a
+ ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a
+ vpbroadcastd m8, [o(pw_m3784_1567)]
+ psubsw m19, m12, m20 ; t19a t28a
+ paddsw m20, m12 ; t16a t31a
+ psubsw m12, m14, m21 ; t20a t27a
+ paddsw m14, m21 ; t23a t24a
+ psubsw m21, m11, m17 ; t18 t29
+ paddsw m11, m17 ; t17 t30
+ psubsw m17, m16, m15 ; t21 t26
+ paddsw m16, m15 ; t22 t25
+ ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a
+ ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28
+ ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27
+ ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a
+ vbroadcasti32x4 m18, [o(deint_shuf)]
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m9, [o(pw_2896_2896)]
+ psubsw m15, m20, m14 ; t23 t24
+ paddsw m20, m14 ; t16 t31
+ psubsw m14, m11, m16 ; t22a t25a
+ paddsw m11, m16 ; t17a t30a
+ psubsw m16, m21, m17 ; t21 t26
+ paddsw m21, m17 ; t18 t29
+ psubsw m17, m19, m12 ; t20a t27a
+ paddsw m19, m12 ; t19a t28a
+ REPX {pshufb x, m18}, m20, m11, m21, m19
+ ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a
+ ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25
+ packssdw m18, m13 ; t23a t22
+ packssdw m12, m15 ; t24a t25
+ ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a
+ ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27
+ packssdw m16, m13 ; t20 t21a
+ packssdw m14, m15 ; t27 t26a
+ punpcklqdq m13, m19, m21 ; t19a t18
+ punpckhqdq m19, m21 ; t28a t29
+ punpcklqdq m21, m20, m11 ; t16 t17a
+ punpckhqdq m20, m11 ; t31 t30a
+INIT_ZMM avx512icl
+ mova m15, [o(permA)]
+ ret
+cglobal_label .main_end
+ vpbroadcastd m10, [o(pw_2048)]
+ vpermt2q m0, m15, m1 ; t0 t1 t2 t3
+ vpermt2q m20, m15, m19 ; t31 t30a t29 t28a
+ vpermt2q m2, m15, m3 ; t4 t5 t6 t7
+ vpermt2q m14, m15, m12 ; t27 t26a t25 t24a
+ vpermt2q m4, m15, m5 ; t8 t9 t10 t11
+ vpermt2q m18, m15, m16 ; t23a t22 t21a t20
+ vpermt2q m6, m15, m7 ; t12 t13 t14 t15
+ vpermt2q m13, m15, m21 ; t19a t18 t17a t16
+ psubsw m7, m0, m20 ; out31 out30 out29 out28
+ paddsw m0, m20 ; out0 out1 out2 out3
+ psubsw m5, m2, m14 ; out27 out26 out25 out24
+ paddsw m2, m14 ; out4 out5 out6 out7
+ psubsw m3, m4, m18 ; out23 out22 out21 out20
+ paddsw m4, m18 ; out8 out9 out10 out11
+ psubsw m1, m6, m13 ; out19 out18 out17 out16
+ paddsw m6, m13 ; out12 out13 out14 out15
+ vzeroupper
+ ret
+
+%macro LOAD_PACKED_16X2 3 ; dst, row[1-2]
+ vbroadcasti32x4 ym%1, [cq+16*%2]
+ vbroadcasti32x4 ym8, [cq+16*%3]
+ shufpd ym%1, ym8, 0x0c
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
+%undef cmp
+ test eobd, eobd
+ jz .dconly
+ lea r5, [o_base]
+ LOAD_PACKED_16X2 0, 0, 2 ; in0 in2
+ LOAD_PACKED_16X2 1, 4, 6 ; in4 in6
+ LOAD_PACKED_16X2 2, 8, 10 ; in8 in10
+ LOAD_PACKED_16X2 3, 12, 14 ; in12 in14
+ LOAD_PACKED_16X2 14, 1, 3 ; in1 in3
+ LOAD_PACKED_16X2 15, 5, 7 ; in5 in7
+ LOAD_PACKED_16X2 16, 9, 11 ; in9 in11
+ LOAD_PACKED_16X2 17, 13, 15 ; in13 in15
+ pxor m4, m4
+ REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
+ cmp eobd, 107
+ jb .fast
+ LOAD_PACKED_16X2 4, 16, 18 ; in16 in18
+ LOAD_PACKED_16X2 5, 20, 22 ; in20 in22
+ LOAD_PACKED_16X2 6, 24, 26 ; in24 in26
+ LOAD_PACKED_16X2 7, 28, 30 ; in28 in30
+ call m(idct_8x16_internal_8bpc).main
+ LOAD_PACKED_16X2 18, 19, 17 ; in19 in17
+ LOAD_PACKED_16X2 19, 23, 21 ; in23 in21
+ LOAD_PACKED_16X2 20, 27, 25 ; in27 in25
+ LOAD_PACKED_16X2 21, 31, 29 ; in31 in29
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+ jmp .pass2
+.fast: ; bottom half is zero
+ mova ym5, ym4
+ mova ym6, ym4
+ mova ym7, ym4
+ call m(idct_8x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+.pass2:
+ vpbroadcastd m10, [o(pw_8192)]
+ vpermt2q m0, m15, m4 ; t0 t1 t9 t8
+ vpermt2q m20, m15, m18 ; t31 t30a t23a t22
+ vpermt2q m3, m15, m7 ; t7 t6 t14 t15
+ vpermt2q m12, m15, m21 ; t25 t24a t17a t16
+ vpermt2q m2, m15, m6 ; t4 t5 t13 t12
+ vpermt2q m14, m15, m13 ; t23a t22 t21a t20
+ vpermt2q m1, m15, m5 ; t3 t2 t10 t11
+ vpermt2q m19, m15, m16 ; t27 t26a t19a t18
+ psubsw m8, m0, m20 ; out31 out30 out22 out23
+ paddsw m0, m20 ; out0 out1 out9 out8
+ paddsw m6, m3, m12 ; out7 out6 out14 out15
+ psubsw m3, m12 ; out24 out25 out17 out16
+ psubsw m5, m2, m14 ; out27 out26 out18 out19
+ paddsw m4, m2, m14 ; out4 out5 out13 out12
+ psubsw m7, m1, m19 ; out28 out29 out21 out20
+ paddsw m2, m1, m19 ; out3 out2 out10 out11
+ vzeroupper
+ vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25
+ vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24
+ vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27
+ vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26
+ vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29
+ vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28
+ vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31
+ vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
+ call .main
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ lea r2, [strideq*3]
+ lea r3, [dstq+strideq*4]
+ movshdup m12, [o(permD)]
+ pmovzxbw m8, [dstq+strideq*0]
+ pmovzxbw m9, [dstq+strideq*1]
+ pmovzxbw m10, [dstq+strideq*2]
+ pmovzxbw m11, [dstq+r2 ]
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ pmovzxbw m8, [r3+strideq*0]
+ pmovzxbw m9, [r3+strideq*1]
+ pmovzxbw m10, [r3+strideq*2]
+ pmovzxbw m11, [r3+r2 ]
+ paddw m4, m8
+ paddw m5, m9
+ paddw m6, m10
+ paddw m7, m11
+ packuswb m0, m1
+ packuswb m2, m3
+ vpermq m0, m12, m0
+ vpermq m2, m12, m2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym2
+ vextracti32x8 [dstq+r2 ], m2, 1
+ packuswb m4, m5
+ packuswb m6, m7
+ vpermq m4, m12, m4
+ vpermq m6, m12, m6
+ mova [r3+strideq*0], ym4
+ vextracti32x8 [r3+strideq*1], m4, 1
+ mova [r3+strideq*2], ym6
+ vextracti32x8 [r3+r2 ], m6, 1
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 8
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+.dconly3:
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova ym1, [dstq+strideq*0]
+ vinserti32x8 m1, [dstq+strideq*1], 1
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
+ ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a
+ ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0
+.main3:
+ paddsw m8, m1, m5 ; t4
+ psubsw m1, m5 ; t5a
+ paddsw m9, m7, m3 ; t7
+ psubsw m7, m3 ; t6a
+ ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6
+ psubsw m5, m0, m2 ; dct4 out2
+ paddsw m2, m0 ; dct4 out1
+ paddsw m0, m4, m6 ; dct4 out0
+ psubsw m4, m6 ; dct4 out3
+ psubsw m6, m2, m1 ; out6
+ paddsw m1, m2 ; out1
+ paddsw m2, m5, m7 ; out2
+ psubsw m5, m7 ; out5
+ psubsw m7, m0, m9 ; out7
+ paddsw m0, m9 ; out0
+ paddsw m3, m4, m8 ; out3
+ psubsw m4, m8 ; out4
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
+ vpbroadcastd m7, [pw_5]
+ paddsw m0, m7, [cq+64*0]
+ paddsw m1, m7, [cq+64*1]
+ vpbroadcastd ym9, strided
+ paddsw m2, m7, [cq+64*2]
+ paddsw m3, m7, [cq+64*3]
+ paddsw m4, m7, [cq+64*4]
+ paddsw m5, m7, [cq+64*5]
+ paddsw m6, m7, [cq+64*6]
+ paddsw m7, [cq+64*7]
+ pmulld ym14, ym9, [pd_0to15]
+ lea r3, [dstq+strideq*1]
+ lea r4, [dstq+strideq*2]
+ kxnorb k1, k1, k1
+ pxor m13, m13
+ add r1, r4 ; dstq+strideq*3
+ kmovb k2, k1
+ vpgatherdq m9{k1}, [r0+ym14*4]
+ kmovb k1, k2
+ vpgatherdq m10{k2}, [r3+ym14*4]
+ kmovb k2, k1
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
+ REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpgatherdq m11{k1}, [r4+ym14*4]
+ kmovb k1, k2
+ vpgatherdq m12{k2}, [r1+ym14*4]
+ REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m8, m9, m13 ; 0 8 16 24
+ punpckhbw m9, m13 ; 4 12 20 28
+ paddw m0, m8
+ paddw m4, m9
+ packuswb m0, m4
+ kmovb k2, k1
+ vpscatterdq [r0+ym14*4]{k1}, m0
+ punpcklbw m8, m10, m13 ; 1 9 17 25
+ punpckhbw m10, m13 ; 5 13 21 29
+ paddw m1, m8
+ paddw m5, m10
+ packuswb m1, m5
+ kmovb k1, k2
+ vpscatterdq [r3+ym14*4]{k2}, m1
+ punpcklbw m8, m11, m13 ; 2 10 18 26
+ punpckhbw m11, m13 ; 6 14 22 30
+ paddw m2, m8
+ paddw m6, m11
+ packuswb m2, m6
+ kmovb k2, k1
+ vpscatterdq [r4+ym14*4]{k1}, m2
+ punpcklbw m8, m12, m13 ; 3 11 19 27
+ punpckhbw m12, m13 ; 7 15 23 31
+ paddw m3, m8
+ paddw m7, m12
+ packuswb m3, m7
+ vpscatterdq [r1+ym14*4]{k2}, m3
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c
+ vpbroadcastd m0, [pw_4096]
+ pmulhrsw m3, m0, [cq+64*0]
+ pmulhrsw m4, m0, [cq+64*4]
+ pmulhrsw m6, m0, [cq+64*1]
+ pmulhrsw m5, m0, [cq+64*5]
+ pmulhrsw m7, m0, [cq+64*2]
+ pmulhrsw m2, m0, [cq+64*6]
+ pmulhrsw m8, m0, [cq+64*3]
+ pmulhrsw m0, [cq+64*7]
+ mova m13, [int8_permA]
+ lea r3, [strideq*3]
+ lea r4, [dstq+strideq*4]
+ punpckldq m1, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m6, m5
+ punpckhdq m6, m5
+ punpckldq m5, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m0
+ punpckhdq m8, m0
+ mova ym9, [dstq+strideq*0]
+ vinserti32x8 m9, [dstq+strideq*2], 1
+ mova ym10, [dstq+strideq*1]
+ vinserti32x8 m10, [dstq+r3 ], 1
+ mova ym11, [r4+strideq*0]
+ vinserti32x8 m11, [r4+strideq*2], 1
+ mova ym12, [r4+strideq*1]
+ vinserti32x8 m12, [r4+r3 ], 1
+ REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8
+ pxor m13, m13
+ REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklqdq m0, m1, m4 ; a0 a2 c0 c2
+ punpckhqdq m1, m4 ; b0 b2 d0 d2
+ punpcklqdq m4, m5, m2 ; a1 a3 c1 c3
+ punpckhqdq m5, m2 ; b1 b3 d1 d3
+ punpcklqdq m2, m3, m6 ; e0 e2 g0 g2
+ punpckhqdq m3, m6 ; f0 f2 h0 h2
+ punpcklqdq m6, m7, m8 ; e1 e3 g1 g3
+ punpckhqdq m7, m8 ; f1 f3 h1 h3
+ punpcklbw m8, m9, m13
+ punpckhbw m9, m13
+ paddw m0, m8
+ paddw m4, m9
+ packuswb m0, m4
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*2], m0, 1
+ punpcklbw m8, m10, m13
+ punpckhbw m10, m13
+ paddw m1, m8
+ paddw m5, m10
+ packuswb m1, m5
+ mova [dstq+strideq*1], ym1
+ vextracti32x8 [dstq+r3 ], m1, 1
+ punpcklbw m8, m11, m13
+ punpckhbw m11, m13
+ paddw m2, m8
+ paddw m6, m11
+ packuswb m2, m6
+ mova [r4+strideq*0], ym2
+ vextracti32x8 [r4+strideq*2], m2, 1
+ punpcklbw m8, m12, m13
+ punpckhbw m12, m13
+ paddw m3, m8
+ paddw m7, m12
+ packuswb m3, m7
+ mova [r4+strideq*1], ym3
+ vextracti32x8 [r4+r3 ], m3, 1
+ RET
+
+%macro IDCT_16x32_END 3 ; src[1-2], row
+ mova xm8, [dstq+strideq*0]
+ vinserti32x4 ym8, [dstq+strideq*1], 1
+ mova xm9, [dstq+r3 ]
+ vinserti32x4 ym9, [dstq+strideq*2], 1
+ pmulhrsw m%1, m10
+ pmulhrsw m%2, m10
+ vpermb m8, m11, m8
+ vpermb m9, m11, m9
+ mova [cq+64*(%3*2+0)], m13
+ mova [cq+64*(%3*2+1)], m13
+ paddw m8, m%1
+ paddw m9, m%2
+ packuswb m8, m9
+ vpermd m8, m12, m8
+ mova [dstq+strideq*0], xm8
+ vextracti32x4 [dstq+strideq*1], ym8, 1
+ vextracti32x4 [dstq+strideq*2], m8, 2
+ vextracti32x4 [dstq+r3 ], m8, 3
+%if %1 != 20
+ lea dstq, [dstq+strideq*4]
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m15, [o(pw_2896x8)]
+ cmp eobd, 151
+ jb .fast
+ pmulhrsw m5, m15, [cq+64*10]
+ pmulhrsw m3, m15, [cq+64* 6]
+ pmulhrsw m1, m15, [cq+64* 2]
+ pmulhrsw m7, m15, [cq+64*14]
+ pmulhrsw m2, m15, [cq+64* 4]
+ pmulhrsw m6, m15, [cq+64*12]
+ pmulhrsw m0, m15, [cq+64* 0]
+ pmulhrsw m4, m15, [cq+64* 8]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pmulhrsw m14, m15, [cq+64* 1]
+ pmulhrsw m21, m15, [cq+64*15]
+ pmulhrsw m18, m15, [cq+64* 9]
+ pmulhrsw m17, m15, [cq+64* 7]
+ pmulhrsw m16, m15, [cq+64* 5]
+ pmulhrsw m19, m15, [cq+64*11]
+ pmulhrsw m20, m15, [cq+64*13]
+ pmulhrsw m15, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova m8, [o(idct_16x32p)]
+ vpbroadcastd m9, [o(pw_16384)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m14, m15, m16, m17, m18, m19, m20, m21
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m1, m2, m3
+ punpckhdq m2, m3
+ REPX {pmulhrsw x, m9}, m8, m0, m1, m2
+ punpckldq m3, m4, m5
+ punpckhdq m4, m5
+ punpckldq m5, m6, m7
+ punpckhdq m6, m7
+ REPX {pmulhrsw x, m9}, m3, m4, m5, m6
+ punpckldq m7, m14, m15
+ punpckhdq m14, m15
+ punpckldq m15, m16, m17
+ punpckhdq m16, m17
+ REPX {pmulhrsw x, m9}, m7, m14, m15, m16
+ punpckldq m17, m18, m19
+ punpckhdq m18, m19
+ punpckldq m19, m20, m21
+ punpckhdq m20, m21
+ REPX {pmulhrsw x, m9}, m17, m18, m19, m20
+ punpcklqdq m21, m8, m1
+ punpckhqdq m8, m1
+ punpcklqdq m1, m0, m2
+ punpckhqdq m0, m2
+ punpcklqdq m2, m3, m5
+ punpckhqdq m3, m5
+ punpcklqdq m5, m4, m6
+ punpckhqdq m4, m6
+ punpcklqdq m6, m7, m15
+ punpckhqdq m7, m15
+ punpcklqdq m15, m14, m16
+ punpckhqdq m14, m16
+ punpcklqdq m16, m17, m19
+ punpckhqdq m17, m19
+ punpcklqdq m19, m18, m20
+ punpckhqdq m18, m20
+ vinserti32x8 m20, m21, ym2, 1
+ vshufi32x4 m21, m2, q3232
+ vinserti32x8 m2, m8, ym3, 1
+ vshufi32x4 m8, m3, q3232
+ vinserti32x8 m3, m1, ym5, 1
+ vshufi32x4 m1, m5, q3232
+ vinserti32x8 m5, m0, ym4, 1
+ vshufi32x4 m0, m4, q3232
+ vinserti32x8 m4, m6, ym16, 1
+ vshufi32x4 m6, m16, q3232
+ vinserti32x8 m16, m7, ym17, 1
+ vshufi32x4 m7, m17, q3232
+ vinserti32x8 m17, m15, ym19, 1
+ vshufi32x4 m15, m19, q3232
+ vinserti32x8 m19, m14, ym18, 1
+ vshufi32x4 m14, m18, q3232
+ vshufi32x4 m18, m21, m6, q3131 ; 27 5
+ vshufi32x4 m21, m6, q2020 ; 31 1
+ vshufi32x4 m6, m8, m7, q2020 ; 24 8
+ vshufi32x4 m8, m7, q3131 ; 30 2
+ vshufi32x4 m7, m1, m15, q2020 ; 28 4
+ vshufi32x4 m1, m15, q3131 ; 6 26
+ vshufi32x4 m15, m0, m14, q2020 ; 7 25
+ vshufi32x4 m0, m14, q3131 ; 14 18
+ vshufi32x4 m14, m20, m4, q2020 ; 3 29
+ vshufi32x4 m20, m4, q3131 ; 23 9
+ vshufi32x4 m9, m3, m17, q2020 ; 16 0
+ vshufi32x4 m3, m17, q3131 ; 12 20
+ vshufi32x4 m17, m5, m19, q2020 ; 15 17
+ vshufi32x4 m5, m19, q3131 ; 22 10
+ vshufi32x4 m19, m2, m16, q2020 ; 19 13
+ vshufi32x4 m16, m2, m16, q3131 ; 11 21
+ call m(idct_16x16_internal_8bpc).main3
+ call .main_oddhalf
+ jmp .pass2
+.fast: ; right half is zero
+ mova ym8, [cq+64*15]
+ vinserti32x8 m8, [cq+64* 1], 1
+ mova m2, [o(int16_perm)]
+ mova ym9, [cq+64* 8]
+ vinserti32x8 m9, [cq+64* 0], 1
+ mova ym0, [cq+64* 7]
+ vinserti32x8 m0, [cq+64* 9], 1
+ mova ym7, [cq+64*14]
+ vinserti32x8 m7, [cq+64* 2], 1
+ mova ym1, [cq+64* 3]
+ vinserti32x8 m1, [cq+64*13], 1
+ mova ym3, [cq+64* 6]
+ vinserti32x8 m3, [cq+64*10], 1
+ mova ym5, [cq+64*11]
+ vinserti32x8 m5, [cq+64* 5], 1
+ mova ym6, [cq+64*12]
+ vinserti32x8 m6, [cq+64* 4], 1
+ REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6
+ REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
+ call m(idct_16x16_internal_8bpc).main2
+ vbroadcasti32x4 m8, [o(int_shuf3)]
+ vbroadcasti32x4 m9, [o(int_shuf4)]
+ vpbroadcastd m11, [o(pw_16384)]
+ pshufb m0, m8
+ pshufb m1, m9
+ pshufb m2, m8
+ pshufb m3, m9
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ pshufb m4, m8
+ pshufb m5, m9
+ pshufb m6, m8
+ pshufb m7, m9
+ REPX {pmulhrsw x, m11}, m4, m5, m6, m7
+ punpckhdq m17, m0, m1
+ punpckldq m0, m1
+ punpckhdq m16, m2, m3
+ punpckldq m2, m3
+ punpckhdq m18, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m6, m7
+ punpckldq m6, m7
+ vinserti32x8 m1, m0, ym2, 1
+ vshufi32x4 m3, m0, m2, q3232
+ vinserti32x8 m2, m4, ym6, 1
+ vshufi32x4 m4, m6, q3232
+ vinserti32x8 m15, m17, ym16, 1
+ vshufi32x4 m17, m16, q3232
+ vinserti32x8 m16, m18, ym5, 1
+ vshufi32x4 m18, m5, q3232
+ vshufi32x4 m0, m1, m2, q2020 ; 0 2
+ vshufi32x4 m1, m2, q3131 ; 4 6
+ vshufi32x4 m2, m3, m4, q2020 ; 8 10
+ vshufi32x4 m3, m4, q3131 ; 12 14
+ vshufi32x4 m14, m15, m16, q2020 ; 1 3
+ vshufi32x4 m15, m16, q3131 ; 5 7
+ vshufi32x4 m16, m17, m18, q2020 ; 9 11
+ vshufi32x4 m17, m18, q3131 ; 13 15
+ pxor m6, m6
+ punpckhwd m8, m0, m0
+ punpcklwd m9, m6, m0
+ punpckhwd m0, m3, m3
+ punpckhwd m5, m2, m2
+ punpcklwd m7, m1, m1
+ punpckhwd m1, m1
+ punpcklwd m3, m3
+ punpcklwd m6, m2
+ call m(idct_16x16_internal_8bpc).main_fast5
+ punpcklwd m21, m14, m14
+ punpckhwd m14, m14
+ punpcklwd m18, m15, m15
+ punpckhwd m15, m15
+ punpcklwd m20, m16, m16
+ punpckhwd m16, m16
+ punpcklwd m19, m17, m17
+ punpckhwd m17, m17
+ call .main_oddhalf_fast
+.pass2:
+ vpbroadcastd m10, [o(pw_2048)]
+ mova m11, [o(end_16x32p)]
+ lea r3, [strideq*3]
+ pxor m13, m13
+ psrld m12, m11, 8
+ IDCT_16x32_END 0, 1, 0
+ IDCT_16x32_END 2, 3, 1
+ IDCT_16x32_END 4, 5, 2
+ IDCT_16x32_END 6, 7, 3
+ IDCT_16x32_END 14, 15, 4
+ IDCT_16x32_END 16, 17, 5
+ IDCT_16x32_END 18, 19, 6
+ IDCT_16x32_END 20, 21, 7
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
+ALIGN function_align
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m8, [o(pw_201_4091x8)]
+ vpbroadcastd m20, [o(pw_m1380_3857x8)]
+ vpbroadcastd m9, [o(pw_995_3973x8)]
+ vpbroadcastd m16, [o(pw_m601_4052x8)]
+ pmulhrsw m21, m8 ; t16a, t31a
+ pmulhrsw m20, m15 ; t19a, t28a
+ pmulhrsw m18, m9 ; t20a, t27a
+ pmulhrsw m14, m16 ; t23a, t24a
+ mova m8, m21
+ mova m17, m20
+ mova m15, m18
+ mova m16, m14
+ jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+ vpbroadcastd m8, [o(pw_201_4091x8)]
+ vpbroadcastd m9, [o(pw_m2751_3035x8)]
+ vpbroadcastd m11, [o(pw_1751_3703x8)]
+ vpbroadcastd m12, [o(pw_m1380_3857x8)]
+ pmulhrsw m21, m8 ; t16a, t31a
+ vpbroadcastd m8, [o(pw_995_3973x8)]
+ pmulhrsw m17, m9 ; t17a, t30a
+ vpbroadcastd m9, [o(pw_m2106_3513x8)]
+ pmulhrsw m20, m11 ; t18a, t29a
+ vpbroadcastd m11, [o(pw_2440_3290x8)]
+ pmulhrsw m15, m12 ; t19a, t28a
+ vpbroadcastd m12, [o(pw_m601_4052x8)]
+ pmulhrsw m18, m8 ; t20a, t27a
+ pmulhrsw m16, m9 ; t21a, t26a
+ pmulhrsw m19, m11 ; t22a, t25a
+ pmulhrsw m14, m12 ; t23a, t24a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a
+ ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
+ ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
+ ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
+ ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a
+ ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
+ ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
+ ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a
+.main2:
+ psubsw m8, m21, m17 ; t17 t30
+ paddsw m21, m17 ; t16 t31
+ psubsw m17, m15, m20 ; t18 t29
+ paddsw m20, m15 ; t19 t28
+ psubsw m15, m18, m16 ; t21 t26
+ paddsw m18, m16 ; t20 t27
+ psubsw m16, m14, m19 ; t22 t25
+ paddsw m14, m19 ; t23 t24
+.main3:
+ ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a
+ ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a
+ ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a
+ ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a
+ vpbroadcastd m11, [o(pw_m3784_1567)]
+ psubsw m19, m21, m20 ; t19a t28a
+ paddsw m21, m20 ; t16a t31a
+ psubsw m20, m14, m18 ; t20a t27a
+ paddsw m14, m18 ; t23a t24a
+ psubsw m18, m8, m17 ; t18 t29
+ paddsw m8, m17 ; t17 t30
+ psubsw m17, m16, m15 ; t21 t26
+ paddsw m15, m16 ; t22 t25
+ ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a
+ ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28
+ ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27
+ ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a
+ vbroadcasti32x4 m9, [o(deint_shuf)]
+ psubsw m16, m21, m14 ; t23 t24
+ paddsw m14, m21 ; t16 t31
+ psubsw m21, m8, m15 ; t22a t25a
+ paddsw m15, m8 ; t17a t30a
+ psubsw m8, m18, m17 ; t21 t26
+ paddsw m18, m17 ; t18 t29
+ paddsw m17, m19, m20 ; t19a t28a
+ psubsw m19, m20 ; t20a t27a
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ REPX {pshufb x, m9}, m14, m15, m18, m17
+ mova m9, m10
+ vpdpwssd m9, m16, m11
+ mova m20, m10
+ vpdpwssd m20, m21, m11
+ psrad m9, 12
+ psrad m20, 12
+ packssdw m9, m20 ; t23a t22
+ mova m20, m10
+ vpdpwssd m20, m16, m12
+ mova m16, m10
+ vpdpwssd m16, m21, m12
+ psrad m20, 12
+ psrad m16, 12
+ packssdw m16, m20, m16 ; t24a t25
+ ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a
+ ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27
+ packssdw m11, m20 ; t27 t26a
+ packssdw m8, m21 ; t20 t21a
+ punpcklqdq m20, m14, m15 ; t16 t17a
+ punpckhqdq m14, m15 ; t31 t30a
+ punpckhqdq m15, m17, m18 ; t28a t29
+ punpcklqdq m17, m18 ; t19a t18
+ psubsw m21, m0, m14 ; out31 out30
+ paddsw m0, m14 ; out0 out1
+ psubsw m14, m7, m20 ; out16 out17
+ paddsw m7, m20 ; out15 out14
+ psubsw m20, m1, m15 ; out28 out29
+ paddsw m1, m15 ; out3 out2
+ psubsw m15, m6, m17 ; out19 out18
+ paddsw m6, m17 ; out12 out13
+ psubsw m17, m4, m9 ; out23 out22
+ paddsw m4, m9 ; out8 out9
+ psubsw m18, m3, m16 ; out24 out25
+ paddsw m3, m16 ; out7 out6
+ psubsw m16, m5, m8 ; out20 out21
+ paddsw m5, m8 ; out11 out10
+ psubsw m19, m2, m11 ; out27 out26
+ paddsw m2, m11 ; out4 out5
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ mova m21, [o(permB)]
+ vpermq m1, m21, [cq+64* 0] ; 0 1
+ vpermq m14, m21, [cq+64* 1] ; 2 3
+ vpermq m20, m21, [cq+64* 2] ; 4 5
+ vpermq m15, m21, [cq+64* 3] ; 6 7
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m2, m21, [cq+64* 4] ; 8 9
+ vpermq m16, m21, [cq+64* 5] ; 10 11
+ vpermq m3, m21, [cq+64* 6] ; 12 13
+ vpermq m17, m21, [cq+64* 7] ; 14 15
+ REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17
+ pxor m12, m12
+ REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7
+ cmp eobd, 151
+ jb .fast
+ vpermq m9, m21, [cq+64* 8] ; 16 17
+ vpermq m19, m21, [cq+64* 9] ; 18 19
+ vpermq m4, m21, [cq+64*10] ; 20 21
+ vpermq m5, m21, [cq+64*11] ; 22 23
+ vpermq m6, m21, [cq+64*12] ; 24 25
+ vpermq m18, m21, [cq+64*13] ; 26 27
+ vpermq m7, m21, [cq+64*14] ; 28 29
+ vpermq m21, m21, [cq+64*15] ; 30 31
+ REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21
+ REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15
+ punpcklwd m8, m21, m14 ; 30 2
+ punpckhwd m21, m1 ; 31 1
+ punpcklwd m0, m17, m19 ; 14 18
+ punpckhwd m17, m9 ; 15 17
+ punpcklwd m9, m1 ; 16 0
+ punpckhwd m14, m7 ; 3 29
+ punpcklwd m1, m15, m18 ; 6 26
+ punpckhwd m15, m6 ; 7 25
+ punpcklwd m6, m2 ; 24 8
+ punpckhwd m19, m3 ; 19 13
+ punpcklwd m3, m4 ; 12 20
+ punpckhwd m18, m20 ; 27 5
+ punpcklwd m7, m20 ; 28 4
+ punpckhwd m20, m5, m2 ; 23 9
+ punpcklwd m5, m16 ; 22 10
+ punpckhwd m16, m4 ; 11 21
+ call m(idct_16x16_internal_8bpc).main2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ jmp .pass2
+.fast: ; bottom half zero
+ punpcklwd m8, m14, m14 ; 2
+ punpcklwd m0, m17, m17 ; 14
+ punpcklwd m5, m16, m16 ; 10
+ punpcklwd m9, m12, m1 ; __ 0
+ punpckhwd m21, m1, m1 ; 1
+ punpcklwd m1, m15, m15 ; 6
+ punpcklwd m7, m20, m20 ; 4
+ punpckhwd m19, m3, m3 ; 13
+ punpcklwd m3, m3 ; 12
+ punpcklwd m6, m12, m2 ; __ 8
+ punpckhwd m18, m20, m20 ; 5
+ punpckhwd m20, m2, m2 ; 9
+ call m(idct_16x16_internal_8bpc).main_fast
+ punpckhwd m15, m15 ; 7
+ punpckhwd m14, m14 ; 3
+ punpckhwd m16, m16 ; 11
+ punpckhwd m17, m17 ; 15
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+.pass2:
+ vpbroadcastd m9, [o(pw_16384)]
+ call .transpose_round
+ vshufi32x4 m16, m14, m2, q3131 ; 5
+ vshufi32x4 m14, m2, q2020 ; 1
+ vshufi32x4 m2, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m1, m18, q3131 ; 6
+ vshufi32x4 m1, m18, q2020 ; 2
+ vshufi32x4 m18, m20, m6, q2020 ; 9
+ vshufi32x4 m20, m6, q3131 ; 13
+ vshufi32x4 m6, m21, m4, q3131 ; 12
+ vshufi32x4 m4, m21, m4, q2020 ; 8
+ vshufi32x4 m21, m19, m7, q3131 ; 15
+ vshufi32x4 m19, m7, q2020 ; 11
+ vshufi32x4 m7, m5, m15, q3131 ; 14
+ vshufi32x4 m5, m15, q2020 ; 10
+ vshufi32x4 m15, m17, m9, q2020 ; 3
+ vshufi32x4 m17, m9, q3131 ; 7
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
+ call .main_oddhalf
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r2, [strideq*3]
+ pmovzxbw m8, [dstq+strideq*0]
+ pmovzxbw m9, [dstq+strideq*1]
+ pmovzxbw m10, [dstq+strideq*2]
+ pmovzxbw m11, [dstq+r2 ]
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3
+ lea r3, [dstq+strideq*4]
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ pmovzxbw m8, [r3+strideq*0]
+ pmovzxbw m9, [r3+strideq*1]
+ pmovzxbw m10, [r3+strideq*2]
+ pmovzxbw m11, [r3+r2 ]
+ REPX {pmulhrsw x, m12}, m4, m5, m6, m7
+ lea r4, [dstq+strideq*8]
+ packuswb m0, m1
+ paddw m4, m8
+ paddw m5, m9
+ packuswb m2, m3
+ paddw m6, m10
+ paddw m7, m11
+ pmovzxbw m8, [r4+strideq*0]
+ pmovzxbw m9, [r4+strideq*1]
+ pmovzxbw m10, [r4+strideq*2]
+ pmovzxbw m11, [r4+r2 ]
+ REPX {pmulhrsw x, m12}, m14, m15, m16, m17
+ lea r5, [r3+strideq*8]
+ packuswb m4, m5
+ paddw m14, m8
+ paddw m15, m9
+ packuswb m6, m7
+ paddw m16, m10
+ paddw m17, m11
+ pmovzxbw m8, [r5+strideq*0]
+ pmovzxbw m9, [r5+strideq*1]
+ pmovzxbw m10, [r5+strideq*2]
+ pmovzxbw m11, [r5+r2 ]
+ REPX {pmulhrsw x, m12}, m18, m19, m20, m21
+ packuswb m14, m15
+ paddw m18, m8
+ paddw m19, m9
+ packuswb m16, m17
+ paddw m20, m10
+ paddw m21, m11
+ packuswb m18, m19
+ packuswb m20, m21
+ REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym2
+ vextracti32x8 [dstq+r2 ], m2, 1
+ mova [r3+strideq*0], ym4
+ vextracti32x8 [r3+strideq*1], m4, 1
+ mova [r3+strideq*2], ym6
+ vextracti32x8 [r3+r2 ], m6, 1
+ mova [r4+strideq*0], ym14
+ vextracti32x8 [r4+strideq*1], m14, 1
+ mova [r4+strideq*2], ym16
+ vextracti32x8 [r4+r2 ], m16, 1
+ mova [r5+strideq*0], ym18
+ vextracti32x8 [r5+strideq*1], m18, 1
+ mova [r5+strideq*2], ym20
+ vextracti32x8 [r5+r2 ], m20, 1
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m2, [o(pw_4017x8)]
+ vpbroadcastd m3, [o(pw_799x8)]
+ vpbroadcastd m18, [o(pw_4076x8)]
+ vpbroadcastd m19, [o(pw_401x8)]
+ vpbroadcastd m20, [o(pw_m1189x8)]
+ vpbroadcastd m16, [o(pw_3920x8)]
+ pmulhrsw m9, m0 ; t0
+ pmulhrsw m2, m1 ; t7a
+ pmulhrsw m1, m3 ; t4a
+ pmulhrsw m18, m14 ; t15a
+ pmulhrsw m14, m19 ; t8a
+ pmulhrsw m20, m15 ; t11a
+ pmulhrsw m15, m16 ; t12a
+ psubsw m7, m9, m2 ; idct8 out7
+ paddsw m0, m9, m2 ; idct8 out0
+ psubsw m4, m9, m1 ; idct8 out4
+ paddsw m3, m9, m1 ; idct8 out3
+ ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
+ mova m21, m18
+ mova m19, m14
+ mova m16, m15
+ mova m8, m20
+ psubsw m6, m9, m1 ; idct8 out6
+ paddsw m1, m9 ; idct8 out1
+ psubsw m5, m9, m2 ; idct8 out5
+ paddsw m2, m9 ; idct8 out2
+ jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+ vpbroadcastd m5, [o(pw_m2276x8)]
+ vpbroadcastd m11, [o(pw_3406x8)]
+ vpbroadcastd m7, [o(pw_4017x8)]
+ vpbroadcastd m12, [o(pw_799x8)]
+ vpbroadcastd m6, [o(pw_3784x8)]
+ vpbroadcastd m10, [o(pw_1567x8)]
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m5, m3 ; t5a
+ pmulhrsw m3, m11 ; t6a
+ pmulhrsw m7, m1 ; t7a
+ pmulhrsw m1, m12 ; t4a
+ pmulhrsw m6, m2 ; t3
+ pmulhrsw m2, m10 ; t2
+ pmulhrsw m4, m0 ; t0
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ vpbroadcastd m10, [o(pd_2048)]
+ mova m0, m4 ; t1
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main3
+ vpbroadcastd m21, [o(pw_4076x8)]
+ vpbroadcastd m8, [o(pw_401x8)]
+ vpbroadcastd m18, [o(pw_m2598x8)]
+ vpbroadcastd m9, [o(pw_3166x8)]
+ vpbroadcastd m19, [o(pw_3612x8)]
+ vpbroadcastd m11, [o(pw_1931x8)]
+ vpbroadcastd m20, [o(pw_m1189x8)]
+ vpbroadcastd m12, [o(pw_3920x8)]
+ pmulhrsw m21, m14 ; t15a
+ pmulhrsw m14, m8 ; t8a
+ pmulhrsw m18, m17 ; t9a
+ pmulhrsw m17, m9 ; t14a
+ pmulhrsw m19, m16 ; t13a
+ pmulhrsw m16, m11 ; t10a
+ pmulhrsw m20, m15 ; t11a
+ pmulhrsw m15, m12 ; t12a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
+.main2:
+ paddsw m8, m20, m16 ; t11
+ psubsw m20, m16 ; t10
+ paddsw m16, m15, m19 ; t12
+ psubsw m15, m19 ; t13
+ psubsw m19, m14, m18 ; t9
+ paddsw m14, m18 ; t8
+ psubsw m18, m21, m17 ; t14
+ paddsw m21, m17 ; t15
+.main3:
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ psubsw m17, m14, m8 ; t11a
+ paddsw m8, m14 ; t8a
+ paddsw m14, m18, m15 ; t9
+ psubsw m18, m15 ; t10
+ psubsw m15, m19, m20 ; t13
+ paddsw m19, m20 ; t14
+ paddsw m20, m21, m16 ; t15a
+ psubsw m16, m21, m16 ; t12a
+ ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a
+ ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12
+ psubsw m21, m0, m20 ; out15
+ paddsw m0, m20 ; out0
+ psubsw m20, m1, m19 ; out14
+ paddsw m1, m19 ; out1
+ psubsw m19, m2, m18 ; out13
+ paddsw m2, m18 ; out2
+ psubsw m18, m3, m17 ; out12
+ paddsw m3, m17 ; out3
+ psubsw m17, m4, m16 ; out11
+ paddsw m4, m16 ; out4
+ psubsw m16, m5, m15 ; out10
+ paddsw m5, m15 ; out5
+ psubsw m15, m6, m14 ; out9
+ paddsw m6, m14 ; out6
+ psubsw m14, m7, m8 ; out8
+ paddsw m7, m8 ; out7
+ ret
+.transpose_round:
+ punpcklwd m8, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m14, m16
+ punpckhwd m14, m16
+ punpcklwd m16, m15, m17
+ punpckhwd m15, m17
+ punpcklwd m17, m19, m21
+ punpckhwd m19, m21
+ punpckhwd m21, m18, m20
+ punpcklwd m18, m20
+ punpcklwd m20, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ REPX {pmulhrsw x, m9}, m20, m8, m1, m0
+ punpcklwd m6, m7, m15
+ punpckhwd m7, m15
+ punpcklwd m15, m14, m16
+ punpckhwd m14, m16
+ REPX {pmulhrsw x, m9}, m2, m3, m5, m4
+ punpckhwd m16, m18, m19
+ punpcklwd m18, m19
+ punpcklwd m19, m21, m17
+ punpckhwd m21, m17
+ REPX {pmulhrsw x, m9}, m6, m7, m15, m14
+ punpcklwd m17, m8, m0 ; a2 a6 aa ae
+ punpckhwd m8, m0 ; a3 a7 ab af
+ punpcklwd m0, m20, m1 ; a0 a4 a8 ac
+ punpckhwd m20, m1 ; a1 a5 a9 ad
+ REPX {pmulhrsw x, m9}, m16, m18, m19, m21
+ punpcklwd m1, m2, m5 ; b0 b4 b8 bc
+ punpckhwd m2, m5 ; b1 b5 b9 bd
+ punpcklwd m5, m3, m4 ; b2 b6 ba be
+ punpckhwd m3, m4 ; b3 b7 bb bf
+ punpcklwd m4, m6, m15 ; c0 c4 c8 cc
+ punpckhwd m6, m15 ; c1 c5 c9 cd
+ punpcklwd m15, m7, m14 ; c2 c6 ca ce
+ punpckhwd m7, m14 ; c3 c7 cb cf
+ punpcklwd m14, m18, m19 ; d0 d4 d8 dc
+ punpckhwd m18, m19 ; d1 d5 d9 dd
+ punpcklwd m9, m16, m21 ; d2 d6 da de
+ punpckhwd m16, m21 ; d3 d7 db df
+ vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc
+ vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4
+ vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6
+ vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be
+ vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7
+ vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf
+ vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4
+ vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc
+ vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5
+ vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd
+ vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5
+ vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd
+ vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6
+ vshufi32x4 m15, m9, q3232 ; ca ce da de
+ vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7
+ vshufi32x4 m7, m16, q3232 ; cb cf db df
+ ret
+
+%macro IDTX_16x32 4 ; src/dst[1-4]
+ pmulhrsw m%1, m15, [cq+64*%1]
+ pmulhrsw m%2, m15, [cq+64*%2]
+ pmulhrsw m%3, m15, [cq+64*%3]
+ pmulhrsw m%4, m15, [cq+64*%4]
+ pmulhrsw m18, m16, m%1
+ pmulhrsw m19, m16, m%2
+ pmulhrsw m20, m16, m%3
+ pmulhrsw m21, m16, m%4
+ REPX {pmulhrsw x, m17}, m18, m19, m20, m21
+ paddsw m%1, m18
+ paddsw m%2, m19
+ paddsw m%3, m20
+ paddsw m%4, m21
+%endmacro
+
+%macro IDTX_16x32_STORE 2 ; src[1-2]
+ mova xm17, [dstq+r3*0]
+ vinserti128 ym17, [dstq+r3*4], 1
+ vinserti32x4 m17, [dstq+r3*8], 2
+ vinserti32x4 m17, [dstq+r4*8], 3
+ mova [cq+64*(%1*2+0)], m18
+ mova [cq+64*(%1*2+1)], m18
+ punpcklbw m16, m17, m18
+ punpckhbw m17, m18
+ paddw m16, m%1
+ paddw m17, m%2
+ packuswb m16, m17
+ mova [dstq+r3*0], xm16
+ vextracti128 [dstq+r3*4], ym16, 1
+ vextracti32x4 [dstq+r3*8], m16, 2
+ vextracti32x4 [dstq+r4*8], m16, 3
+%if %1 != 7
+ add dstq, strideq
+%endif
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
+ vpbroadcastd m15, [pw_2896x8]
+ vpbroadcastd m16, [pw_1697x16]
+ vpbroadcastd m17, [pw_16384]
+ IDTX_16x32 0, 1, 2, 3
+ IDTX_16x32 4, 5, 6, 7
+ IDTX_16x32 8, 9, 10, 11
+ IDTX_16x32 12, 13, 14, 15
+ vpbroadcastd m16, [pw_8192]
+ call .transpose_2x8x8_round
+ lea r3, [strideq*2]
+ lea r4, [strideq*3]
+ pxor m18, m18
+ IDTX_16x32_STORE 0, 8
+ IDTX_16x32_STORE 1, 9
+ IDTX_16x32_STORE 2, 10
+ IDTX_16x32_STORE 3, 11
+ IDTX_16x32_STORE 4, 12
+ IDTX_16x32_STORE 5, 13
+ IDTX_16x32_STORE 6, 14
+ IDTX_16x32_STORE 7, 15
+ RET
+ALIGN function_align
+.transpose_2x8x8_round:
+ punpckhwd m17, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m17, m1
+ punpckhdq m17, m1
+ REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m17
+ punpcklqdq m6, m17
+ punpckhwd m17, m12, m13
+ punpcklwd m12, m13
+ punpckhwd m13, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m9, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m10, m11
+ punpcklwd m10, m11
+ punpckhdq m11, m8, m10
+ punpckldq m8, m10
+ punpckldq m10, m12, m14
+ punpckhdq m12, m14
+ punpckhdq m14, m13, m15
+ punpckldq m13, m15
+ punpckldq m15, m17, m9
+ punpckhdq m17, m9
+ REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
+ punpckhqdq m9, m8, m10
+ punpcklqdq m8, m10
+ punpcklqdq m10, m11, m12
+ punpckhqdq m11, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m14, m17
+ punpcklqdq m14, m17
+ ret
+
+%macro IDTX_32x16 4 ; dst[1-4]
+ pmulhrsw m%2, m12, [cq+32*(%1+ 0)]
+ pmulhrsw m18, m12, [cq+32*(%1+16)]
+ pmulhrsw m%4, m12, [cq+32*(%3+ 0)]
+ pmulhrsw m19, m12, [cq+32*(%3+16)]
+ REPX {paddsw x, x}, m%2, m18, m%4, m19
+ mova m%1, m14
+ vpermi2q m%1, m%2, m18
+ vpermt2q m%2, m16, m18
+%if %3 != 14
+ mova m%3, m14
+%endif
+ vpermi2q m%3, m%4, m19
+ vpermt2q m%4, m16, m19
+ pmulhrsw m18, m17, m%1
+ pmulhrsw m19, m17, m%2
+ pmulhrsw m20, m17, m%3
+ pmulhrsw m21, m17, m%4
+ REPX {paddsw x, x}, m%1, m%2, m%3, m%4
+ paddsw m%1, m18
+ paddsw m%2, m19
+ paddsw m%3, m20
+ paddsw m%4, m21
+%endmacro
+
+%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32
+ mova ym19, [dstq+strideq*0]
+ vinserti32x8 m19, [dstq+strideq*8], 1
+%if %3 == 0
+ mova [cq+64*(%1*2+0)], m20
+ mova [cq+64*(%1*2+1)], m20
+%endif
+ punpcklbw m18, m19, m20
+ punpckhbw m19, m20
+ paddw m18, m%1
+ paddw m19, m%2
+ packuswb m18, m19
+ mova [dstq+strideq*0], ym18
+ vextracti32x8 [dstq+strideq*8], m18, 1
+%if %3 || %1 != 7
+ add dstq, strideq
+%endif
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
+ vpbroadcastd m12, [pw_2896x8]
+ movu m14, [permB+7]
+ vpbroadcastd m17, [pw_1697x16]
+ psrlq m16, m14, 4
+ IDTX_32x16 0, 1, 2, 3
+ IDTX_32x16 4, 5, 6, 7
+ IDTX_32x16 8, 9, 10, 11
+ IDTX_32x16 12, 13, 14, 15
+ vpbroadcastd m16, [pw_2048]
+ call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
+ pxor m20, m20
+ IDTX_32x16_STORE 0, 8
+ IDTX_32x16_STORE 1, 9
+ IDTX_32x16_STORE 2, 10
+ IDTX_32x16_STORE 3, 11
+ IDTX_32x16_STORE 4, 12
+ IDTX_32x16_STORE 5, 13
+ IDTX_32x16_STORE 6, 14
+ IDTX_32x16_STORE 7, 15
+ RET
+
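+; IDCT_32x32_END: combines one odd-half row (src) with the matching even-half
+; row (still in a register if mem < 8, otherwise reloaded from cq and the slot
+; zeroed), rounds the sum/difference by pw_2048 and stores two output rows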
+%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
+ pmovzxbw m10, [dstq+%3]
+ pmovzxbw m11, [r3 +%4]
+%if %2 < 8
+ paddsw m8, m%2, m%1
+ psubsw m9, m%2, m%1
+%else
+ mova m9, [cq+64*(%2*2-16)]
+ paddsw m8, m9, m%1
+ psubsw m9, m%1
+%endif
+ pmulhrsw m8, m12
+ pmulhrsw m9, m12
+%if %2 >= 8
+%if %2 == 8
+ pxor m0, m0
+%endif
+ mova [cq+64*(%2*2-16)], m0
+ mova [cq+64*(%2*2-15)], m0
+%endif
+ paddw m8, m10
+ paddw m9, m11
+ packuswb m8, m9
+ vpermq m8, m13, m8
+ mova [dstq+%3], ym8
+ vextracti32x8 [r3 +%4], m8, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+ add dstq, r5
+ sub r3, r5
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ WIN64_SPILL_XMM 30
+ cmp eobd, 136
+ jb .fast
+ mova m5, [cq+64*20]
+ mova m3, [cq+64*12]
+ mova m1, [cq+64* 4]
+ mova m7, [cq+64*28]
+ mova m2, [cq+64* 8]
+ mova m6, [cq+64*24]
+ mova m0, [cq+64* 0]
+ mova m4, [cq+64*16]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m14, [cq+64* 2]
+ mova m21, [cq+64*30]
+ mova m18, [cq+64*18]
+ mova m17, [cq+64*14]
+ mova m16, [cq+64*10]
+ mova m19, [cq+64*22]
+ mova m20, [cq+64*26]
+ mova m15, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m22, [cq+64* 1]
+ mova m21, [cq+64*31]
+ mova m14, [cq+64*17]
+ mova m29, [cq+64*15]
+ mova m26, [cq+64* 9]
+ mova m17, [cq+64*23]
+ mova m18, [cq+64*25]
+ mova m25, [cq+64* 7]
+ mova m24, [cq+64* 5]
+ mova m19, [cq+64*27]
+ mova m16, [cq+64*21]
+ mova m27, [cq+64*11]
+ mova m28, [cq+64*13]
+ mova m15, [cq+64*19]
+ mova m20, [cq+64*29]
+ mova m23, [cq+64* 3]
+ call .main_oddhalf
+ vpbroadcastd m10, [o(pw_8192)]
+ psubsw m13, m0, m29 ; 31
+ paddsw m0, m29 ; 0
+ psubsw m29, m1, m28 ; 30
+ paddsw m1, m28 ; 1
+ psubsw m28, m2, m27 ; 29
+ paddsw m2, m27 ; 2
+ psubsw m27, m3, m26 ; 28
+ paddsw m3, m26 ; 3
+ psubsw m26, m4, m25 ; 27
+ paddsw m4, m25 ; 4
+ psubsw m25, m5, m24 ; 26
+ paddsw m5, m24 ; 5
+ psubsw m24, m6, m23 ; 25
+ paddsw m6, m23 ; 6
+ psubsw m23, m7, m22 ; 24
+ paddsw m7, m22 ; 7
+ pxor m9, m9
+ punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
+ REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
+ punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
+ REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
+ punpckhwd m3, m23, m24
+ punpcklwd m23, m24
+ punpckhwd m24, m25, m26
+ punpcklwd m25, m26
+ REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
+ punpckhwd m26, m27, m28
+ punpcklwd m27, m28
+ punpckhwd m28, m29, m13
+ punpcklwd m29, m13
+ REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
+ punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5
+ REPX {pmulhrsw x, m10}, m0, m4, m8, m22
+ punpckhdq m13, m23, m25
+ punpckldq m23, m25
+ punpckhdq m25, m27, m29
+ punpckldq m27, m29
+ REPX {pmulhrsw x, m10}, m13, m23, m25, m27
+ punpckhdq m9, m3, m24
+ punpckldq m3, m24
+ punpckhdq m24, m26, m28
+ punpckldq m26, m28
+ punpcklqdq m5, m23, m27 ; d00 d08 d16 d24
+ punpckhqdq m23, m27 ; d01 d09 d17 d25
+ punpckhqdq m27, m13, m25 ; d03 d11 d19 d27
+ punpcklqdq m13, m25 ; d02 d10 d18 d26
+ punpckhqdq m25, m3, m26 ; d05 d13 d21 d29
+ punpcklqdq m3, m26 ; d04 d12 d20 d28
+ punpckhqdq m26, m9, m24 ; d07 d15 d23 d31
+ punpcklqdq m9, m24 ; d06 d14 d22 d30
+ REPX {pmulhrsw x, m10}, m25, m3, m26
+ mova [cq+64* 9], m23
+ mova [cq+64*11], m27
+ mova [cq+64*13], m25
+ mova [cq+64*15], m26
+ punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+ punpcklqdq m8, m22 ; a04 a12 a20 a28
+ punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
+ punpcklqdq m0, m4 ; a00 a08 a16 a24
+ punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
+ punpcklqdq m7, m2 ; a02 a10 a18 a26
+ punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
+ punpcklqdq m6, m1 ; a06 a14 a22 a30
+ mova m2, [cq+64* 0]
+ mova m11, [cq+64* 2]
+ mova m12, [cq+64* 4]
+ mova m29, [cq+64* 6]
+ mova m27, [cq+64* 8]
+ mova m26, [cq+64*10]
+ mova m4, [cq+64*12]
+ mova m28, [cq+64*14]
+ psubsw m1, m2, m21 ; 23
+ paddsw m2, m21 ; 8
+ psubsw m21, m11, m20 ; 22
+ paddsw m11, m20 ; 9
+ psubsw m20, m12, m19 ; 21
+ paddsw m12, m19 ; 10
+ psubsw m19, m29, m18 ; 20
+ paddsw m29, m18 ; 11
+ psubsw m18, m27, m17 ; 19
+ paddsw m27, m17 ; 12
+ psubsw m17, m26, m16 ; 18
+ paddsw m26, m16 ; 13
+ paddsw m16, m4, m15 ; 14
+ psubsw m4, m15 ; 17
+ pmulhrsw m15, m6, m10
+ psubsw m6, m28, m14 ; 16
+ paddsw m28, m14 ; 15
+ pmulhrsw m14, m7, m10
+ punpcklwd m7, m6, m4
+ punpckhwd m6, m4
+ punpckhwd m4, m17, m18
+ punpcklwd m17, m18
+ punpckhwd m18, m19, m20
+ punpcklwd m19, m20
+ punpckhwd m20, m21, m1
+ punpcklwd m21, m1
+ punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
+ punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3
+ punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
+ punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
+ punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+ punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
+ punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
+ punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
+ pmulhrsw m23, m10
+ pmulhrsw m25, m10
+ punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1
+ REPX {pmulhrsw x, m10}, m28, m2, m12, m27
+ punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
+ punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
+ punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+ punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
+ REPX {pmulhrsw x, m10}, m16, m1, m11, m29
+ punpckhdq m26, m19, m21
+ punpckldq m19, m21
+ punpckhdq m21, m6, m4
+ punpckldq m6, m4
+ REPX {pmulhrsw x, m10}, m26, m19, m21, m6
+ punpckhdq m4, m18, m20
+ punpckldq m18, m20
+ punpckhdq m20, m7, m17
+ punpckldq m7, m17
+ REPX {pmulhrsw x, m10}, m4, m18, m20, m7
+ punpcklqdq m17, m28, m12 ; b02 b10 b18 b26
+ punpckhqdq m28, m12 ; b03 b11 b19 b27
+ punpckhqdq m12, m2, m27 ; b01 b09 b17 b25
+ punpcklqdq m2, m27 ; b00 b08 b16 b24
+ punpckhqdq m27, m1, m29 ; b05 b13 b21 b29
+ punpcklqdq m1, m29 ; b04 b12 b20 b28
+ punpckhqdq m29, m16, m11 ; b07 b15 b23 b31
+ punpcklqdq m16, m11 ; b06 b14 b22 b30
+ mova [cq+64* 1], m12
+ mova [cq+64* 3], m28
+ mova [cq+64* 5], m27
+ mova [cq+64* 7], m29
+ punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+ punpcklqdq m20, m26 ; c02 c10 c18 c26
+ punpckhqdq m26, m7, m19 ; c01 c09 c17 c25
+ punpcklqdq m7, m19 ; c00 c08 c16 c24
+ punpckhqdq m28, m6, m18 ; c05 c13 c21 c29
+ punpcklqdq m6, m18 ; c04 c12 c20 c28
+ punpckhqdq m29, m21, m4 ; c07 c15 c23 c31
+ punpcklqdq m21, m4 ; c06 c14 c22 c30
+ pmulhrsw m19, m9, m10
+ vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24
+ vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08
+ vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24
+ vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08
+ vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28
+ vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12
+ vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28
+ vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12
+ vshufi32x4 m3, m1, m6, q3131 ; 12
+ vshufi32x4 m1, m6, q2020 ; 4
+ vshufi32x4 m6, m4, m2, q3131 ; 24
+ vshufi32x4 m4, m2, q2020 ; 16
+ vshufi32x4 m2, m0, m7, q3131 ; 8
+ vshufi32x4 m0, m7, q2020 ; 0
+ vshufi32x4 m7, m5, m8, q3131 ; 28
+ vshufi32x4 m5, m8, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26
+ vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10
+ vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26
+ vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10
+ vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30
+ vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14
+ vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30
+ vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14
+ vshufi32x4 m16, m14, m20, q3131 ; 10
+ vshufi32x4 m14, m20, q2020 ; 2
+ vshufi32x4 m20, m18, m17, q3131 ; 26
+ vshufi32x4 m18, m17, q2020 ; 18
+ vshufi32x4 m17, m15, m21, q3131 ; 14
+ vshufi32x4 m15, m21, q2020 ; 6
+ vshufi32x4 m21, m19, m13, q3131 ; 30
+ vshufi32x4 m19, m13, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m15, [cq+64* 1]
+ mova m16, [cq+64* 3]
+ mova m17, [cq+64* 5]
+ mova m19, [cq+64* 7]
+ mova m20, [cq+64* 9]
+ mova m21, [cq+64*11]
+ mova m13, [cq+64*13]
+ mova m18, [cq+64*15]
+ vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25
+ vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09
+ vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27
+ vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11
+ vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29
+ vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13
+ vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31
+ vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15
+ vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09
+ vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25
+ vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11
+ vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27
+ vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13
+ vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29
+ vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15
+ vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31
+ vshufi32x4 m18, m14, m26, q3131 ; 25
+ vshufi32x4 m14, m26, q2020 ; 17
+ vshufi32x4 m19, m15, m27, q3131 ; 27
+ vshufi32x4 m15, m27, q2020 ; 19
+ vshufi32x4 m20, m16, m28, q3131 ; 29
+ vshufi32x4 m16, m28, q2020 ; 21
+ vshufi32x4 m21, m17, m29, q3131 ; 31
+ vshufi32x4 m17, m29, q2020 ; 23
+ vshufi32x4 m26, m22, m8, q3131 ; 9
+ vshufi32x4 m22, m8, q2020 ; 1
+ vshufi32x4 m27, m23, m9, q3131 ; 11
+ vshufi32x4 m23, m9, q2020 ; 3
+ vshufi32x4 m28, m24, m11, q3131 ; 13
+ vshufi32x4 m24, m11, q2020 ; 5
+ vshufi32x4 m29, m25, m12, q3131 ; 15
+ vshufi32x4 m25, m12, q2020 ; 7
+ call .main_oddhalf
+ jmp .end
+.fast: ; bottom/right halves are zero
+ mova m14, [o(dup16_perm)]
+ pmovzxwd m9, [cq+64* 0]
+ pmovzxwd m6, [cq+64* 8]
+ vpermb m8, m14, [cq+64* 2]
+ vpermb ym0, ym14, [cq+64*14]
+ vpermb ym5, ym14, [cq+64*10]
+ vpermb m1, m14, [cq+64* 6]
+ vpermb m7, m14, [cq+64* 4]
+ vpermb ym3, ym14, [cq+64*12]
+ pslld m9, 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpermb m21, m14, [cq+64* 1]
+ vpermb ym17, ym14, [cq+64*15]
+ vpermb ym20, ym14, [cq+64* 9]
+ vpermb m15, m14, [cq+64* 7]
+ vpermb m18, m14, [cq+64* 5]
+ vpermb ym16, ym14, [cq+64*11]
+ vpermb ym19, ym14, [cq+64*13]
+ vpermb m14, m14, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m9, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
+ vshufi32x4 m22, m14, m2, q2020 ; 1
+ vshufi32x4 m24, m14, m2, q3131 ; 5
+ vshufi32x4 m23, m17, m9, q2020 ; 3
+ vshufi32x4 m25, m17, m9, q3131 ; 7
+ vshufi32x4 m16, m5, m15, q2020 ; 10
+ vshufi32x4 m17, m5, m15, q3131 ; 14
+ vshufi32x4 m14, m1, m18, q2020 ; 2
+ vshufi32x4 m15, m1, m18, q3131 ; 6
+ vshufi32x4 m1, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m21, m4, q3131 ; 12
+ vshufi32x4 m2, m21, m4, q2020 ; 8
+ vshufi32x4 m26, m20, m6, q2020 ; 9
+ vshufi32x4 m28, m20, m6, q3131 ; 13
+ vshufi32x4 m27, m19, m7, q2020 ; 11
+ vshufi32x4 m29, m19, m7, q3131 ; 15
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ call .main_oddhalf_fast
+.end:
+ lea r4, [strideq*3]
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r3, [dstq+r4*8]
+ lea r5, [strideq+r4] ; stride*4
+ add r3, r5 ; dst+stride*28
+ IDCT_32x32_END 29, 0, strideq*0, r4
+ IDCT_32x32_END 28, 1, strideq*1, strideq*2
+ IDCT_32x32_END 27, 2, strideq*2, strideq*1
+ IDCT_32x32_END 26, 3, r4 , strideq*0
+ IDCT_32x32_END 25, 4, strideq*0, r4
+ IDCT_32x32_END 24, 5, strideq*1, strideq*2
+ IDCT_32x32_END 23, 6, strideq*2, strideq*1
+ IDCT_32x32_END 22, 7, r4 , strideq*0
+ IDCT_32x32_END 21, 8, strideq*0, r4
+ IDCT_32x32_END 20, 9, strideq*1, strideq*2
+ IDCT_32x32_END 19, 10, strideq*2, strideq*1
+ IDCT_32x32_END 18, 11, r4 , strideq*0
+ IDCT_32x32_END 17, 12, strideq*0, r4
+ IDCT_32x32_END 16, 13, strideq*1, strideq*2
+ IDCT_32x32_END 15, 14, strideq*2, strideq*1
+ IDCT_32x32_END 14, 15, r4 , strideq*0
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
+ALIGN function_align
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m21, [o(pw_4091x8)]
+ vpbroadcastd m8, [o(pw_201x8)]
+ vpbroadcastd m18, [o(pw_m1380x8)]
+ vpbroadcastd m9, [o(pw_3857x8)]
+ vpbroadcastd m19, [o(pw_3973x8)]
+ vpbroadcastd m11, [o(pw_995x8)]
+ vpbroadcastd m28, [o(pw_m601x8)]
+ vpbroadcastd m12, [o(pw_4052x8)]
+ pmulhrsw m21, m22 ; t31a
+ pmulhrsw m22, m8 ; t16a
+ pmulhrsw m18, m25 ; t19a
+ pmulhrsw m25, m9 ; t28a
+ pmulhrsw m19, m24 ; t27a
+ pmulhrsw m24, m11 ; t20a
+ pmulhrsw m28, m23 ; t23a
+ pmulhrsw m23, m12 ; t24a
+ mova m15, m21
+ mova m8, m22
+ mova m14, m18
+ mova m27, m25
+ mova m29, m19
+ mova m26, m24
+ mova m16, m28
+ mova m20, m23
+ jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+ vpbroadcastd m21, [o(pw_4091x8)]
+ vpbroadcastd m8, [o(pw_201x8)]
+ vpbroadcastd m14, [o(pw_m2751x8)]
+ vpbroadcastd m9, [o(pw_3035x8)]
+ vpbroadcastd m17, [o(pw_3703x8)]
+ vpbroadcastd m11, [o(pw_1751x8)]
+ vpbroadcastd m18, [o(pw_m1380x8)]
+ vpbroadcastd m12, [o(pw_3857x8)]
+ pmulhrsw m21, m22 ; t31a
+ vpbroadcastd m19, [o(pw_3973x8)]
+ pmulhrsw m22, m8 ; t16a
+ vpbroadcastd m8, [o(pw_995x8)]
+ pmulhrsw m14, m29 ; t30a
+ vpbroadcastd m16, [o(pw_m2106x8)]
+ pmulhrsw m29, m9 ; t17a
+ vpbroadcastd m9, [o(pw_3513x8)]
+ pmulhrsw m17, m26 ; t29a
+ vpbroadcastd m15, [o(pw_3290x8)]
+ pmulhrsw m26, m11 ; t18a
+ vpbroadcastd m11, [o(pw_2440x8)]
+ pmulhrsw m18, m25 ; t19a
+ vpbroadcastd m20, [o(pw_m601x8)]
+ pmulhrsw m25, m12 ; t28a
+ vpbroadcastd m12, [o(pw_4052x8)]
+ pmulhrsw m19, m24 ; t27a
+ pmulhrsw m24, m8 ; t20a
+ pmulhrsw m16, m27 ; t21a
+ pmulhrsw m27, m9 ; t26a
+ pmulhrsw m15, m28 ; t25a
+ pmulhrsw m28, m11 ; t22a
+ pmulhrsw m20, m23 ; t23a
+ pmulhrsw m23, m12 ; t24a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a
+ ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a
+.main2:
+ psubsw m8, m22, m14 ; t17
+ paddsw m22, m14 ; t16
+ paddsw m14, m18, m26 ; t19
+ psubsw m18, m26 ; t18
+ psubsw m26, m24, m16 ; t21
+ paddsw m24, m16 ; t20
+ psubsw m16, m20, m28 ; t22
+ paddsw m28, m20 ; t23
+ psubsw m20, m23, m15 ; t25
+ paddsw m23, m15 ; t24
+ psubsw m15, m21, m29 ; t30
+ paddsw m21, m29 ; t31
+ psubsw m29, m19, m27 ; t26
+ paddsw m19, m27 ; t27
+ paddsw m27, m25, m17 ; t28
+ psubsw m25, m17 ; t29
+.main3:
+ ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a
+ ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a
+ ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ psubsw m17, m21, m27 ; t28a
+ paddsw m21, m27 ; t31a
+ psubsw m27, m15, m25 ; t18
+ paddsw m15, m25 ; t17
+ psubsw m25, m20, m29 ; t21
+ paddsw m20, m29 ; t22
+ psubsw m29, m8, m18 ; t29
+ paddsw m8, m18 ; t30
+ psubsw m18, m22, m14 ; t19a
+ paddsw m22, m14 ; t16a
+ psubsw m14, m28, m24 ; t20a
+ paddsw m24, m28 ; t23a
+ paddsw m28, m16, m26 ; t25
+ psubsw m16, m26 ; t26
+ psubsw m26, m23, m19 ; t27a
+ paddsw m23, m19 ; t24a
+ ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a
+ ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a
+ ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ psubsw m19, m27, m25 ; t26
+ paddsw m27, m25 ; t29
+ psubsw m25, m17, m26 ; t20a
+ paddsw m17, m26 ; t19a
+ paddsw m26, m18, m14 ; t28a
+ psubsw m18, m14 ; t27a
+ paddsw m14, m22, m24 ; t16
+ psubsw m22, m24 ; t23
+ psubsw m24, m29, m16 ; t21
+ paddsw m16, m29 ; t18
+ paddsw m29, m21, m23 ; t31
+ psubsw m21, m23 ; t24
+ psubsw m23, m15, m20 ; t22a
+ paddsw m15, m20 ; t17a
+ psubsw m20, m8, m28 ; t25a
+ paddsw m28, m8 ; t30a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27
+ ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a
+ ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25
+ ret
+
+%macro IDTX_32x32 2 ; dst[1-2]
+ vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
+ vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to
+ vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements
+ vmovdqa32 ym18, [cq+64*(%2+16)]
+ vpermt2q m%1, m21, m17
+ vpermt2q m%2, m21, m18
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
+ movu m21, [permB+7]
+ vpbroadcastd m16, [pw_8192]
+ pxor m20, m20
+.loop:
+ IDTX_32x32 0, 1
+ IDTX_32x32 2, 3
+ IDTX_32x32 4, 5
+ IDTX_32x32 6, 7
+ IDTX_32x32 8, 9
+ IDTX_32x32 10, 11
+ IDTX_32x32 12, 13
+ IDTX_32x32 14, 15
+ call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
+ IDTX_32x16_STORE 0, 8, 1
+ IDTX_32x16_STORE 1, 9, 1
+ IDTX_32x16_STORE 2, 10, 1
+ IDTX_32x16_STORE 3, 11, 1
+ IDTX_32x16_STORE 4, 12, 1
+ IDTX_32x16_STORE 5, 13, 1
+ IDTX_32x16_STORE 6, 14, 1
+ IDTX_32x16_STORE 7, 15, 1
+ lea dstq, [dstq+strideq*8]
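+; btc flips bit 5 of cq (a 32-byte offset, initially clear since cq is 64-byte
+; aligned) so the second pass reads the other half of each coefficient row, and
+; sets CF on that pass, ending the loop with cq restored for the zeroing below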
+ btc cq, 5
+ jnc .loop
+ mov r0d, 8
+.zero_loop:
+ mova [cq+64*0], m20
+ mova [cq+64*1], m20
+ mova [cq+64*2], m20
+ mova [cq+64*3], m20
+ add cq, 64*4
+ dec r0d
+ jg .zero_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ WIN64_SPILL_XMM 30
+ cmp eobd, 151
+ jb .fast
+ mova m5, [cq+64*10]
+ mova m3, [cq+64* 6]
+ mova m1, [cq+64* 2]
+ mova m7, [cq+64*14]
+ mova m2, [cq+64* 4]
+ mova m6, [cq+64*12]
+ mova m0, [cq+64* 0]
+ mova m4, [cq+64* 8]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m14, [cq+64* 1]
+ mova m21, [cq+64*15]
+ mova m18, [cq+64* 9]
+ mova m17, [cq+64* 7]
+ mova m16, [cq+64* 5]
+ mova m19, [cq+64*11]
+ mova m20, [cq+64*13]
+ mova m15, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ vpbroadcastd m9, [o(pw_8192)]
+%macro TRANSPOSE_8x4_ROUND 4
+ punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3
+ punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7
+ REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
+%endmacro
+ TRANSPOSE_8x4_ROUND 0, 1, 2, 3
+ TRANSPOSE_8x4_ROUND 4, 5, 6, 7
+ TRANSPOSE_8x4_ROUND 14, 15, 16, 17
+ TRANSPOSE_8x4_ROUND 18, 19, 20, 21
+ vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4
+ vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12
+ vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5
+ vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13
+ vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6
+ vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14
+ vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7
+ vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15
+ vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4
+ vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12
+ vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5
+ vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13
+ vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6
+ vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14
+ vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7
+ vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15
+ vshufi32x4 m22, m26, m4, q2020 ; 0 1
+ vshufi32x4 m26, m4, q3131 ; 8 9
+ vshufi32x4 m23, m27, m5, q2020 ; 2 3
+ vshufi32x4 m27, m5, q3131 ; 10 11
+ vshufi32x4 m24, m28, m6, q2020 ; 4 5
+ vshufi32x4 m28, m6, q3131 ; 12 13
+ vshufi32x4 m25, m29, m7, q2020 ; 6 7
+ vshufi32x4 m29, m7, q3131 ; 14 15
+ vshufi32x4 m4, m0, m14, q2020 ; 16 17
+ vshufi32x4 m3, m0, m14, q3131 ; 24 25
+ vshufi32x4 m20, m1, m15, q2020 ; 18 19
+ vshufi32x4 m19, m1, m15, q3131 ; 26 27
+ vshufi32x4 m5, m2, m16, q2020 ; 20 21
+ vshufi32x4 m0, m2, m16, q3131 ; 28 29
+ vshufi32x4 m16, m8, m17, q2020 ; 22 23
+ vshufi32x4 m17, m8, m17, q3131 ; 30 31
+ pxor m6, m6
+ mova [cq+64* 0], m4
+ mova [cq+64* 2], m5
+ mova [cq+64* 4], m3
+ mova [cq+64* 6], m0
+ punpcklwd m8, m24, m24 ; 4
+ punpcklwd m0, m0 ; 28
+ punpcklwd m5, m5 ; 20
+ punpcklwd m1, m28, m28 ; 12
+ punpcklwd m7, m26, m26 ; 8
+ punpcklwd m3, m3 ; 24
+ punpcklwd m9, m6, m22 ; __ 0
+ punpcklwd m6, m4 ; __ 16
+ call m(idct_16x16_internal_8bpc).main_fast3
+ mova [cq+64* 1], m20
+ mova [cq+64* 3], m16
+ mova [cq+64* 5], m19
+ mova [cq+64* 7], m17
+ punpcklwd m21, m23, m23 ; 2
+ punpcklwd m17, m17 ; 30
+ punpcklwd m20, m20 ; 18
+ punpcklwd m15, m29, m29 ; 14
+ punpcklwd m18, m27, m27 ; 10
+ punpcklwd m16, m16 ; 22
+ punpcklwd m19, m19 ; 26
+ punpcklwd m14, m25, m25 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ mova m21, [cq+64* 7]
+ mova m14, [cq+64* 0]
+ mova m17, [cq+64* 3]
+ mova m18, [cq+64* 4]
+ mova m19, [cq+64* 5]
+ mova m16, [cq+64* 2]
+ mova m15, [cq+64* 1]
+ mova m20, [cq+64* 6]
+ REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
+ m24, m19, m16, m27, m28, m15, m20, m23
+ call .main_oddhalf
+ jmp .end
+.fast: ; right half is zero
+ mova ym8, [cq+64*15]
+ vinserti32x8 m8, [cq+64* 1], 1
+ mova m2, [o(int16_perm)]
+ mova ym9, [cq+64* 8]
+ vinserti32x8 m9, [cq+64* 0], 1
+ mova ym0, [cq+64* 7]
+ vinserti32x8 m0, [cq+64* 9], 1
+ mova ym7, [cq+64*14]
+ vinserti32x8 m7, [cq+64* 2], 1
+ mova ym1, [cq+64* 3]
+ vinserti32x8 m1, [cq+64*13], 1
+ mova ym3, [cq+64* 6]
+ vinserti32x8 m3, [cq+64*10], 1
+ mova ym5, [cq+64*11]
+ vinserti32x8 m5, [cq+64* 5], 1
+ mova ym6, [cq+64*12]
+ vinserti32x8 m6, [cq+64* 4], 1
+ REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
+ call m(idct_16x16_internal_8bpc).main2
+ vbroadcasti32x4 m8, [o(int_shuf3)]
+ vbroadcasti32x4 m9, [o(int_shuf4)]
+ vpbroadcastd m11, [o(pw_8192)]
+ pshufb m0, m8
+ pshufb m1, m9
+ pshufb m2, m8
+ pshufb m3, m9
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ pshufb m4, m8
+ pshufb m5, m9
+ pshufb m6, m8
+ pshufb m7, m9
+ REPX {pmulhrsw x, m11}, m4, m5, m6, m7
+ punpckhdq m28, m0, m1
+ punpckldq m0, m1
+ punpckhdq m27, m2, m3
+ punpckldq m2, m3
+ punpckhdq m22, m4, m5
+ punpckldq m4, m5
+ punpckhdq m23, m6, m7
+ punpckldq m6, m7
+ vinserti32x8 m14, m0, ym2, 1
+ vshufi32x4 m15, m0, m2, q3232
+ vinserti32x8 m2, m4, ym6, 1
+ vshufi32x4 m4, m6, q3232
+ vshufi32x4 m21, m14, m2, q2020 ; 0 2
+ vshufi32x4 m14, m2, q3131 ; 4 6
+ vshufi32x4 m18, m15, m4, q2020 ; 8 10
+ vshufi32x4 m15, m4, q3131 ; 12 14
+ pxor m9, m9
+ punpcklwd m8, m14, m14 ; 4
+ punpcklwd m1, m15, m15 ; 12
+ punpcklwd m7, m18, m18 ; 8
+ punpcklwd m9, m21 ; __ 0
+ call m(idct_16x16_internal_8bpc).main_fast4
+ punpckhwd m21, m21 ; 2
+ punpckhwd m15, m15 ; 14
+ punpckhwd m18, m18 ; 10
+ punpckhwd m14, m14 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vinserti32x8 m24, m28, ym27, 1
+ vshufi32x4 m28, m27, q3232
+ vinserti32x8 m27, m22, ym23, 1
+ vshufi32x4 m22, m23, q3232
+ vshufi32x4 m23, m24, m27, q2020 ; 1 3
+ vshufi32x4 m24, m27, q3131 ; 5 7
+ vshufi32x4 m27, m28, m22, q2020 ; 9 11
+ vshufi32x4 m28, m22, q3131 ; 13 15
+ punpcklwd m22, m23, m23 ; 1
+ punpckhwd m29, m28, m28 ; 15
+ punpcklwd m26, m27, m27 ; 9
+ punpckhwd m25, m24, m24 ; 7
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ punpcklwd m24, m24 ; 5
+ punpckhwd m27, m27 ; 11
+ punpcklwd m28, m28 ; 13
+ punpckhwd m23, m23 ; 3
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ call .main_oddhalf_fast
+.end:
+ imul r6, strideq, 60
+ mova m10, [o(end_16x32p)]
+ vpbroadcastd m11, [o(pw_2048)]
+ lea r3, [strideq*3]
+ pxor m12, m12
+ add r6, dstq ; dst+stride*60
+ psrldq m13, m10, 1
+ lea r4, [strideq+r3] ; stride*4
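+; IDCT_16x64_END: for %1 < 8 the values were already combined at the end of
+; .main_oddhalf and only need the pw_2048 rounding; for %1 >= 8 they are first
+; added to/subtracted from the row stored at cq+64*%1, which is then cleared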
+%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
+%if %1 & 1
+ %define %%s0 r3
+ %define %%s1 strideq*2
+ %define %%s2 strideq*1
+ %define %%s3 strideq*0
+%else
+ %define %%s0 strideq*0
+ %define %%s1 strideq*1
+ %define %%s2 strideq*2
+ %define %%s3 r3
+%if %1
+ add dstq, r4
+ sub r6, r4
+%endif
+%endif
+%if %1 < 8
+ pmulhrsw m8, m11, m%1
+ pmulhrsw m9, m11, m%2
+%else
+ mova m9, [cq+64*%1]
+ paddsw m8, m9, m%2 ; out 0+n, 1+n
+ psubsw m9, m%2 ; out 63-n, 62-n
+ pmulhrsw m8, m11
+ pmulhrsw m9, m11
+%endif
+ mova xm29, [dstq+%%s0]
+ vinserti128 ym29, [dstq+%%s1], 1
+ mova xm%3, [r6 +%%s3]
+ vinserti128 ym%3, [r6 +%%s2], 1
+ vpermb m29, m10, m29
+ vpermb m%3, m10, m%3
+ mova [cq+64*%1], m12
+ paddw m29, m8
+ paddw m%3, m9
+ packuswb m29, m%3
+ vpermd m29, m13, m29
+ mova [dstq+%%s0], xm29
+ vextracti128 [dstq+%%s1], ym29, 1
+ vextracti32x4 [r6 +%%s2], m29, 2
+ vextracti32x4 [r6 +%%s3], m29, 3
+%endmacro
+ IDCT_16x64_END 0, 29, 0
+ IDCT_16x64_END 1, 28, 28
+ IDCT_16x64_END 2, 27, 28
+ IDCT_16x64_END 3, 26, 28
+ IDCT_16x64_END 4, 25, 28
+ IDCT_16x64_END 5, 24, 28
+ IDCT_16x64_END 6, 23, 28
+ IDCT_16x64_END 7, 22, 28
+ IDCT_16x64_END 8, 21, 28
+ IDCT_16x64_END 9, 20, 28
+ IDCT_16x64_END 10, 19, 28
+ IDCT_16x64_END 11, 18, 28
+ IDCT_16x64_END 12, 17, 28
+ IDCT_16x64_END 13, 16, 28
+ IDCT_16x64_END 14, 15, 28
+ IDCT_16x64_END 15, 14, 28
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 64
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
+ vpbroadcastd m8, [o(pw_101_4095x8)]
+ vpbroadcastd m21, [o(pw_m1474_3822x8)]
+ vpbroadcastd m14, [o(pw_897_3996x8)]
+ vpbroadcastd m17, [o(pw_m700_4036x8)]
+ vpbroadcastd m18, [o(pw_501_4065x8)]
+ vpbroadcastd m19, [o(pw_m1092_3948x8)]
+ vpbroadcastd m16, [o(pw_1285_3889x8)]
+ vpbroadcastd m15, [o(pw_m301_4085x8)]
+ pmulhrsw m8, m22 ; t32a t63a
+ pmulhrsw m21, m29 ; t35a t60a
+ pmulhrsw m14, m26 ; t36a t59a
+ pmulhrsw m17, m25 ; t39a t56a
+ pmulhrsw m18, m24 ; t40a t55a
+ pmulhrsw m19, m27 ; t43a t52a
+ pmulhrsw m16, m28 ; t44a t51a
+ pmulhrsw m15, m23 ; t47a t48a
+ mova m22, m8
+ mova m29, m21
+ mova m26, m14
+ mova m25, m17
+ mova m24, m18
+ mova m27, m19
+ mova m28, m16
+ mova m20, m15
+ jmp .main_oddhalf2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ vpbroadcastd m8, [o(pw_101_4095x8)]
+ vpbroadcastd m9, [o(pw_m2824_2967x8)]
+ vpbroadcastd m11, [o(pw_1660_3745x8)]
+ vpbroadcastd m12, [o(pw_m1474_3822x8)]
+ pmulhrsw m22, m8 ; t32a t63a
+ vpbroadcastd m8, [o(pw_897_3996x8)]
+ pmulhrsw m21, m9 ; t33a t62a
+ vpbroadcastd m9, [o(pw_m2191_3461x8)]
+ pmulhrsw m14, m11 ; t34a t61a
+ vpbroadcastd m11, [o(pw_2359_3349x8)]
+ pmulhrsw m29, m12 ; t35a t60a
+ vpbroadcastd m12, [o(pw_m700_4036x8)]
+ pmulhrsw m26, m8 ; t36a t59a
+ vpbroadcastd m8, [o(pw_501_4065x8)]
+ pmulhrsw m17, m9 ; t37a t58a
+ vpbroadcastd m9, [o(pw_m2520_3229x8)]
+ pmulhrsw m18, m11 ; t38a t57a
+ vpbroadcastd m11, [o(pw_2019_3564x8)]
+ pmulhrsw m25, m12 ; t39a t56a
+ vpbroadcastd m12, [o(pw_m1092_3948x8)]
+ pmulhrsw m24, m8 ; t40a t55a
+ vpbroadcastd m8, [o(pw_1285_3889x8)]
+ pmulhrsw m19, m9 ; t41a t54a
+ vpbroadcastd m9, [o(pw_m1842_3659x8)]
+ pmulhrsw m16, m11 ; t42a t53a
+ vpbroadcastd m11, [o(pw_2675_3102x8)]
+ pmulhrsw m27, m12 ; t43a t52a
+ vpbroadcastd m12, [o(pw_m301_4085x8)]
+ pmulhrsw m28, m8 ; t44a t51a
+ pmulhrsw m15, m9 ; t45a t50a
+ pmulhrsw m20, m11 ; t46a t49a
+ pmulhrsw m23, m12 ; t47a t48a
+ psubsw m8, m22, m21 ; t33 t62
+ paddsw m22, m21 ; t32 t63
+ psubsw m21, m29, m14 ; t34 t61
+ paddsw m29, m14 ; t35 t60
+ psubsw m14, m26, m17 ; t37 t58
+ paddsw m26, m17 ; t36 t59
+ psubsw m17, m25, m18 ; t38 t57
+ paddsw m25, m18 ; t39 t56
+ psubsw m18, m24, m19 ; t41 t54
+ paddsw m24, m19 ; t40 t55
+ psubsw m19, m27, m16 ; t42 t53
+ paddsw m27, m16 ; t43 t52
+ psubsw m16, m28, m15 ; t45 t50
+ paddsw m28, m15 ; t44 t51
+ psubsw m15, m23, m20 ; t46 t49
+ paddsw m20, m23 ; t47 t48
+.main_oddhalf2:
+ ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a
+ ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a
+ ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a
+ ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
+ ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a
+ ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
+ ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a
+ ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
+ vpbroadcastd m11, [o(pw_m4017_799)]
+ psubsw m23, m25, m26 ; t36a t59a
+ paddsw m25, m26 ; t39a t56a
+ psubsw m26, m24, m27 ; t43a t52a
+ paddsw m27, m24 ; t40a t55a
+ psubsw m24, m20, m28 ; t44a t51a
+ paddsw m20, m28 ; t47a t48a
+ psubsw m28, m8, m21 ; t34 t61
+ paddsw m8, m21 ; t33 t62
+ psubsw m21, m17, m14 ; t37 t58
+ paddsw m17, m14 ; t38 t57
+ psubsw m14, m18, m19 ; t42 t53
+ paddsw m18, m19 ; t41 t54
+ psubsw m19, m15, m16 ; t45 t50
+ paddsw m15, m16 ; t46 t49
+ psubsw m16, m22, m29 ; t35a t60a
+ paddsw m22, m29 ; t32a t63a
+ ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60
+ ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a
+ ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59
+ ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a
+ vpbroadcastd m11, [o(pw_m2276_3406)]
+ ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52
+ ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a
+ ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51
+ ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ psubsw m29, m22, m25 ; t39 t56
+ paddsw m22, m25 ; t32 t63
+ psubsw m25, m20, m27 ; t40 t55
+ paddsw m20, m27 ; t47 t48
+ psubsw m27, m8, m17 ; t38a t57a
+ paddsw m8, m17 ; t33a t62a
+ psubsw m17, m15, m18 ; t41a t54a
+ paddsw m15, m18 ; t46a t49a
+ paddsw m18, m16, m23 ; t35a t60a
+ psubsw m16, m23 ; t36a t59a
+ psubsw m23, m24, m26 ; t43a t52a
+ paddsw m24, m26 ; t44a t51a
+ paddsw m26, m28, m21 ; t34 t61
+ psubsw m28, m21 ; t37 t58
+ psubsw m21, m19, m14 ; t42 t53
+ paddsw m19, m14 ; t45 t50
+ ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a
+ ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57
+ ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59
+ ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a
+ ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54
+ ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52
+ ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a
+ vbroadcasti32x4 m13, [o(deint_shuf)]
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ paddsw m14, m22, m20 ; t32a t63a
+ psubsw m22, m20 ; t47a t48a
+ psubsw m20, m8, m15 ; t46 t49
+ paddsw m8, m15 ; t33 t62
+ paddsw m15, m18, m24 ; t35 t60
+ psubsw m18, m24 ; t44 t51
+ psubsw m24, m26, m19 ; t45a t50a
+ paddsw m26, m19 ; t34a t61a
+ REPX {pshufb x, m13}, m14, m8, m15, m26
+ psubsw m19, m29, m25 ; t40 t55
+ paddsw m25, m29 ; t39 t56
+ psubsw m29, m27, m17 ; t41a t54a
+ paddsw m27, m17 ; t38a t57a
+ psubsw m17, m16, m23 ; t43a t52a
+ paddsw m16, m23 ; t36a t59a
+ psubsw m9, m28, m21 ; t42 t53
+ paddsw m28, m21 ; t37 t58
+ REPX {pshufb x, m13}, m25, m27, m16, m28
+ ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48
+ ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a
+ packssdw m21, m22 ; t47 t46a
+ packssdw m13, m23 ; t48 t49a
+ ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a
+ ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50
+ packssdw m20, m18 ; t44a t45
+ packssdw m22, m23 ; t51a t50
+ ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a
+ ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54
+ packssdw m18, m19 ; t40a t41
+ packssdw m24, m23 ; t55a t54
+ ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52
+ ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a
+ packssdw m19, m17 ; t43 t42a
+ packssdw m23, m29 ; t52 t53a
+ punpcklqdq m17, m25, m27 ; t39 t38a
+ punpckhqdq m25, m27 ; t56 t57a
+ punpckhqdq m27, m15, m26 ; t60 t61a
+ punpcklqdq m15, m26 ; t35 t34a
+ punpckhqdq m26, m16, m28 ; t59a t58
+ punpcklqdq m16, m28 ; t36a t37
+ punpckhqdq m28, m14, m8 ; t63a t62
+ punpcklqdq m14, m8 ; t32a t33
+ psubsw m29, m0, m28 ; out63 out62
+ paddsw m0, m28 ; out0 out1
+ psubsw m28, m1, m27 ; out60 out61
+ paddsw m1, m27 ; out3 out2
+ psubsw m27, m2, m26 ; out59 out58
+ paddsw m2, m26 ; out4 out5
+ psubsw m26, m3, m25 ; out56 out57
+ paddsw m3, m25 ; out7 out6
+ psubsw m25, m4, m24 ; out55 out54
+ paddsw m4, m24 ; out8 out9
+ psubsw m24, m5, m23 ; out52 out53
+ paddsw m5, m23 ; out11 out10
+ psubsw m23, m6, m22 ; out51 out50
+ paddsw m6, m22 ; out12 out13
+ psubsw m22, m7, m13 ; out48 out49
+ paddsw m7, m13 ; out15 out14
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+.dconly:
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova m1, [dstq]
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ WIN64_SPILL_XMM 31
+ mova m19, [o(dup16_perm)]
+ mova m24, [cq+64* 2]
+ mova m28, [cq+64* 6]
+ mova m26, [cq+64* 4]
+ mova m22, [cq+64* 0]
+ mova m23, [cq+64* 1]
+ mova m29, [cq+64* 7]
+ mova m27, [cq+64* 5]
+ mova m25, [cq+64* 3]
+ vpermb m8, m19, m24 ; 4
+ vpermb m1, m19, m28 ; 12
+ vpermb m7, m19, m26 ; 8
+ vpermb m9, m19, m22 ; __ 0
+ vpermb m21, m19, m23 ; 2
+ vpermb m15, m19, m29 ; 14
+ vpermb m18, m19, m27 ; 10
+ vpermb m14, m19, m25 ; 6
+ pslld m9, 16
+ vpord m30, m19, [o(pb_32)] {1to16}
+ REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
+ cmp eobd, 151
+ jb .fast
+ vpermb m0, m19, [cq+64*14] ; 28
+ vpermb m5, m19, [cq+64*10] ; 20
+ vpermb m3, m19, [cq+64*12] ; 24
+ vpermb m6, m19, [cq+64* 8] ; __ 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpermb m17, m19, [cq+64*15] ; 30
+ vpermb m20, m19, [cq+64* 9] ; 18
+ vpermb m16, m19, [cq+64*11] ; 22
+ vpermb m19, m19, [cq+64*13] ; 26
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ vpermb m21, m30, [cq+64*15]
+ vpermb m14, m30, [cq+64* 8]
+ vpermb m17, m30, [cq+64*11]
+ vpermb m18, m30, [cq+64*12]
+ vpermb m19, m30, [cq+64*13]
+ vpermb m16, m30, [cq+64*10]
+ vpermb m15, m30, [cq+64* 9]
+ vpermb m20, m30, [cq+64*14]
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
+ jmp .end
+.fast: ; bottom half is zero
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+.end:
+ mova [cq+64* 8], m4
+ mova [cq+64* 9], m5
+ mova [cq+64*10], m6
+ mova [cq+64*11], m7
+ mova [cq+64*12], m26
+ mova [cq+64*13], m27
+ mova [cq+64*14], m28
+ mova [cq+64*15], m29
+ vpbroadcastd m13, [o(pw_8192)]
+ call .pass1_end
+ call .pass2
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ pmulhrsw m0, m13, [cq+64* 8]
+ pmulhrsw m1, m13, [cq+64* 9]
+ pmulhrsw m2, m13, [cq+64*10]
+ pmulhrsw m3, m13, [cq+64*11]
+ vpbroadcastd m30, [o(pw_2048)]
+ pmulhrsw m4, m13, m22
+ pmulhrsw m5, m13, m23
+ pmulhrsw m6, m13, m24
+ pmulhrsw m7, m13, m25
+ pmulhrsw m22, m30, m14
+ pmulhrsw m14, m13, m26
+ pmulhrsw m23, m30, m15
+ pmulhrsw m15, m13, m27
+ pmulhrsw m24, m30, m16
+ pmulhrsw m16, m13, m28
+ pmulhrsw m25, m30, m17
+ pmulhrsw m17, m13, m29
+ pmulhrsw m26, m30, m18
+ pmulhrsw m18, m13, [cq+64*12]
+ pmulhrsw m27, m30, m19
+ pmulhrsw m19, m13, [cq+64*13]
+ pmulhrsw m28, m30, m20
+ pmulhrsw m20, m13, [cq+64*14]
+ pmulhrsw m29, m30, m21
+ pmulhrsw m21, m13, [cq+64*15]
+ call .transpose_round
+ call .pass2
+ pxor m10, m10
+ lea r3, [strideq*3]
+%macro IDCT_64x16_END 4
+ mova m9, [dstq+%4]
+%if %1 < 8
+ pmulhrsw m%3, m30, [cq+64*%1]
+%endif
+ pmulhrsw m%2, m30
+ mova [cq+64*%1], m10
+ punpcklbw m8, m9, m10
+ punpckhbw m9, m10
+ paddw m8, m%3
+ paddw m9, m%2
+ packuswb m8, m9
+ mova [dstq+%4], m8
+%if %1 == 3 || %1 == 7 || %1 == 11
+ lea dstq, [dstq+strideq*4]
+%endif
+%endmacro
+ IDCT_64x16_END 0, 0, 11, strideq*0
+ IDCT_64x16_END 1, 1, 11, strideq*1
+ IDCT_64x16_END 2, 2, 11, strideq*2
+ IDCT_64x16_END 3, 3, 11, r3
+ IDCT_64x16_END 4, 4, 11, strideq*0
+ IDCT_64x16_END 5, 5, 11, strideq*1
+ IDCT_64x16_END 6, 6, 11, strideq*2
+ IDCT_64x16_END 7, 7, 11, r3
+ IDCT_64x16_END 8, 14, 22, strideq*0
+ IDCT_64x16_END 9, 15, 23, strideq*1
+ IDCT_64x16_END 10, 16, 24, strideq*2
+ IDCT_64x16_END 11, 17, 25, r3
+ IDCT_64x16_END 12, 18, 26, strideq*0
+ IDCT_64x16_END 13, 19, 27, strideq*1
+ IDCT_64x16_END 14, 20, 28, strideq*2
+ IDCT_64x16_END 15, 21, 29, r3
+ RET
+ALIGN function_align
+.pass1_end:
+ mova m4, [cq+64* 0]
+ mova m5, [cq+64* 1]
+ mova m6, [cq+64* 2]
+ mova m7, [cq+64* 3]
+ mova m8, [cq+64* 4]
+ mova m9, [cq+64* 5]
+ mova m11, [cq+64* 6]
+ mova m12, [cq+64* 7]
+ psubsw m29, m4, m21 ; out47 out46
+ paddsw m4, m21 ; out16 out17
+ psubsw m28, m5, m20 ; out44 out45
+ paddsw m5, m20 ; out19 out18
+ REPX {pmulhrsw x, m13}, m0, m1, m2, m3
+ psubsw m27, m6, m19 ; out43 out42
+ paddsw m6, m19 ; out20 out21
+ psubsw m26, m7, m18 ; out40 out41
+ paddsw m7, m18 ; out23 out22
+ pmulhrsw m18, m13, m22
+ pmulhrsw m19, m13, m23
+ pmulhrsw m20, m13, m24
+ pmulhrsw m21, m13, m25
+ paddsw m25, m12, m14 ; out31 out30
+ psubsw m14, m12, m14 ; out32 out33
+ paddsw m24, m11, m15 ; out28 out29
+ psubsw m15, m11, m15 ; out35 out34
+ REPX {pmulhrsw x, m13}, m4, m5, m6, m7
+ paddsw m23, m9, m16 ; out27 out26
+ psubsw m16, m9, m16 ; out36 out37
+ paddsw m22, m8, m17 ; out24 out25
+ psubsw m17, m8, m17 ; out39 out38
+ REPX {pmulhrsw x, m13}, m14, m15, m16, m17
+.transpose_round:
+%macro TRANSPOSE_8x4_PACKED 4
+ punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
+ punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3
+ punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
+ punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3
+ punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
+ punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1
+ punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3
+ punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1
+ punpcklwd m%3, m%4, m%2 ; 2
+ punpckhwd m%4, m%2 ; 3
+ punpckhwd m%2, m%1, m8 ; 1
+ punpcklwd m%1, m8 ; 0
+%endmacro
+ TRANSPOSE_8x4_PACKED 0, 1, 2, 3
+ TRANSPOSE_8x4_PACKED 18, 19, 20, 21
+ TRANSPOSE_8x4_PACKED 4, 5, 6, 7
+ TRANSPOSE_8x4_PACKED 14, 15, 16, 17
+ vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03
+ vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01
+ vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13
+ vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11
+ vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23
+ vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21
+ vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33
+ vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31
+ vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03
+ vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01
+ vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13
+ vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11
+ vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23
+ vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21
+ vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33
+ vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31
+ ret
+.pass2:
+ vshufi32x4 m7, m5, m19, q3131 ; 14
+ vshufi32x4 m5, m19, q2020 ; 10
+ vshufi32x4 m21, m6, m20, q3131 ; 15
+ vshufi32x4 m19, m6, m20, q2020 ; 11
+ vshufi32x4 m20, m4, m18, q3131 ; 13
+ vshufi32x4 m18, m4, m18, q2020 ; 9
+ vshufi32x4 m6, m8, m2, q3131 ; 12
+ vshufi32x4 m4, m8, m2, q2020 ; 8
+ vshufi32x4 m2, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m1, m16, q3131 ; 6
+ vshufi32x4 m1, m16, q2020 ; 2
+ vshufi32x4 m16, m9, m15, q3131 ; 5
+ vshufi32x4 m14, m9, m15, q2020 ; 1
+ vshufi32x4 m15, m11, m17, q2020 ; 3
+ vshufi32x4 m17, m11, m17, q3131 ; 7
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
+ jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob
+ vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ pmulhrsw m5, m23, [cq+64*20]
+ pmulhrsw m3, m23, [cq+64*12]
+ pmulhrsw m1, m23, [cq+64* 4]
+ pmulhrsw m7, m23, [cq+64*28]
+ pmulhrsw m2, m23, [cq+64* 8]
+ pmulhrsw m6, m23, [cq+64*24]
+ pmulhrsw m0, m23, [cq+64* 0]
+ pmulhrsw m4, m23, [cq+64*16]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pmulhrsw m14, m23, [cq+64* 2]
+ pmulhrsw m21, m23, [cq+64*30]
+ pmulhrsw m18, m23, [cq+64*18]
+ pmulhrsw m17, m23, [cq+64*14]
+ pmulhrsw m16, m23, [cq+64*10]
+ pmulhrsw m19, m23, [cq+64*22]
+ pmulhrsw m20, m23, [cq+64*26]
+ pmulhrsw m15, m23, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ pmulhrsw m22, m23, [cq+64* 1]
+ pmulhrsw m21, m23, [cq+64*31]
+ pmulhrsw m14, m23, [cq+64*17]
+ pmulhrsw m29, m23, [cq+64*15]
+ pmulhrsw m26, m23, [cq+64* 9]
+ pmulhrsw m17, m23, [cq+64*23]
+ pmulhrsw m18, m23, [cq+64*25]
+ pmulhrsw m25, m23, [cq+64* 7]
+ pmulhrsw m24, m23, [cq+64* 5]
+ pmulhrsw m19, m23, [cq+64*27]
+ pmulhrsw m16, m23, [cq+64*21]
+ pmulhrsw m27, m23, [cq+64*11]
+ pmulhrsw m28, m23, [cq+64*13]
+ pmulhrsw m15, m23, [cq+64*19]
+ pmulhrsw m20, m23, [cq+64*29]
+ pmulhrsw m23, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ vpbroadcastd m12, [o(pw_16384)]
+ psubsw m13, m0, m29 ; 31
+ paddsw m0, m29 ; 0
+ psubsw m29, m1, m28 ; 30
+ paddsw m1, m28 ; 1
+ psubsw m28, m2, m27 ; 29
+ paddsw m2, m27 ; 2
+ psubsw m27, m3, m26 ; 28
+ paddsw m3, m26 ; 3
+ psubsw m26, m4, m25 ; 27
+ paddsw m4, m25 ; 4
+ psubsw m25, m5, m24 ; 26
+ paddsw m5, m24 ; 5
+ psubsw m24, m6, m23 ; 25
+ paddsw m6, m23 ; 6
+ psubsw m23, m7, m22 ; 24
+ paddsw m7, m22 ; 7
+ pxor m9, m9
+ punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
+ REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
+ punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
+ REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
+ punpckhwd m3, m23, m24
+ punpcklwd m23, m24
+ punpckhwd m24, m25, m26
+ punpcklwd m25, m26
+ REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
+ punpckhwd m26, m27, m28
+ punpcklwd m27, m28
+ punpckhwd m28, m29, m13
+ punpcklwd m29, m13
+ REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
+ punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
+ REPX {pmulhrsw x, m12}, m7, m0, m2, m4
+ punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5
+ REPX {pmulhrsw x, m12}, m6, m8, m1, m22
+ punpckhdq m13, m23, m25
+ punpckldq m23, m25
+ punpckhdq m25, m27, m29
+ punpckldq m27, m29
+ REPX {pmulhrsw x, m12}, m13, m23, m25, m27
+ punpckhdq m9, m3, m24
+ punpckldq m3, m24
+ punpckhdq m24, m26, m28
+ punpckldq m26, m28
+ REPX {pmulhrsw x, m12}, m9, m3, m24, m26
+ punpckhqdq m5, m23, m27 ; d01 d09 d17 d25
+ punpcklqdq m23, m27 ; d00 d08 d16 d24
+ punpcklqdq m27, m13, m25 ; d02 d10 d18 d26
+ punpckhqdq m13, m25 ; d03 d11 d19 d27
+ punpcklqdq m25, m3, m26 ; d04 d12 d20 d28
+ punpckhqdq m3, m26 ; d05 d13 d21 d29
+ punpcklqdq m26, m9, m24 ; d06 d14 d22 d30
+ punpckhqdq m9, m24 ; d07 d15 d23 d31
+ mova [cq+64* 3], m23
+ mova [cq+64*13], m27
+ mova [cq+64* 7], m25
+ mova [cq+64*15], m26
+ punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+ punpcklqdq m8, m22 ; a04 a12 a20 a28
+ punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
+ punpcklqdq m0, m4 ; a00 a08 a16 a24
+ punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
+ punpcklqdq m7, m2 ; a02 a10 a18 a26
+ punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
+ punpcklqdq m6, m1 ; a06 a14 a22 a30
+ mova [cq+64* 1], m0
+ mova [cq+64* 9], m7
+ mova [cq+64* 5], m8
+ mova [cq+64*11], m6
+ mova m2, [cq+64* 0]
+ mova m11, [cq+64* 2]
+ mova m8, [cq+64* 4]
+ mova m29, [cq+64* 6]
+ mova m27, [cq+64* 8]
+ mova m26, [cq+64*10]
+ mova m4, [cq+64*12]
+ mova m28, [cq+64*14]
+ psubsw m1, m2, m21 ; 23
+ paddsw m2, m21 ; 8
+ psubsw m21, m11, m20 ; 22
+ paddsw m11, m20 ; 9
+ psubsw m20, m8, m19 ; 21
+ paddsw m8, m19 ; 10
+ psubsw m19, m29, m18 ; 20
+ paddsw m29, m18 ; 11
+ psubsw m18, m27, m17 ; 19
+ paddsw m27, m17 ; 12
+ psubsw m17, m26, m16 ; 18
+ paddsw m26, m16 ; 13
+ psubsw m16, m4, m15 ; 17
+ paddsw m4, m15 ; 14
+ psubsw m15, m28, m14 ; 16
+ paddsw m28, m14 ; 15
+ punpcklwd m14, m15, m16
+ punpckhwd m15, m16
+ punpckhwd m16, m17, m18
+ punpcklwd m17, m18
+ punpckhwd m18, m19, m20
+ punpcklwd m19, m20
+ punpckhwd m20, m21, m1
+ punpcklwd m21, m1
+ punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
+ punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3
+ punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
+ punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
+ punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+ punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
+ punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
+ punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
+ punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1
+ REPX {pmulhrsw x, m12}, m28, m2, m8, m27
+ punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
+ punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
+ punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+ punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
+ REPX {pmulhrsw x, m12}, m4, m1, m11, m29
+ punpckhdq m26, m19, m21
+ punpckldq m19, m21
+ punpckhdq m21, m15, m16
+ punpckldq m15, m16
+ REPX {pmulhrsw x, m12}, m26, m19, m21, m15
+ punpckhdq m16, m18, m20
+ punpckldq m18, m20
+ punpckhdq m20, m14, m17
+ punpckldq m14, m17
+ REPX {pmulhrsw x, m12}, m16, m18, m20, m14
+ punpckhqdq m17, m28, m8 ; b03 b11 b19 b27
+ punpcklqdq m28, m8 ; b02 b10 b18 b26
+ punpckhqdq m8, m2, m27 ; b01 b09 b17 b25
+ punpcklqdq m2, m27 ; b00 b08 b16 b24
+ punpcklqdq m27, m1, m29 ; b04 b12 b20 b28
+ punpckhqdq m1, m29 ; b05 b13 b21 b29
+ punpcklqdq m29, m4, m11 ; b06 b14 b22 b30
+ punpckhqdq m4, m11 ; b07 b15 b23 b31
+ mova [cq+64* 0], m2
+ mova [cq+64* 8], m28
+ mova [cq+64* 4], m27
+ mova [cq+64*10], m29
+ punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+ punpcklqdq m20, m26 ; c02 c10 c18 c26
+ punpckhqdq m26, m14, m19 ; c01 c09 c17 c25
+ punpcklqdq m14, m19 ; c00 c08 c16 c24
+ punpckhqdq m28, m15, m18 ; c05 c13 c21 c29
+ punpcklqdq m15, m18 ; c04 c12 c20 c28
+ punpckhqdq m29, m21, m16 ; c07 c15 c23 c31
+ punpcklqdq m21, m16 ; c06 c14 c22 c30
+ mova [cq+64* 2], m14
+ mova [cq+64*12], m20
+ mova [cq+64* 6], m15
+ mova [cq+64*14], m21
+ vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25
+ vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09
+ vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27
+ vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11
+ vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29
+ vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13
+ vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31
+ vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15
+ vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09
+ vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25
+ vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11
+ vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27
+ vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13
+ vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29
+ vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15
+ vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31
+ mov r4, rsp
+ vshufi32x4 m0, m22, m19, q2020 ; 1
+ vshufi32x4 m1, m17, m29, q3131 ; 31
+ vshufi32x4 m2, m14, m26, q2020 ; 17
+ vshufi32x4 m3, m25, m18, q3131 ; 15
+ call .main_part1
+ vshufi32x4 m0, m25, m18, q2020 ; 7
+ vshufi32x4 m1, m14, m26, q3131 ; 25
+ vshufi32x4 m2, m17, m29, q2020 ; 23
+ vshufi32x4 m3, m22, m19, q3131 ; 9
+ call .main_part1
+ vshufi32x4 m0, m24, m21, q2020 ; 5
+ vshufi32x4 m1, m15, m27, q3131 ; 27
+ vshufi32x4 m2, m16, m28, q2020 ; 21
+ vshufi32x4 m3, m23, m20, q3131 ; 11
+ call .main_part1
+ vshufi32x4 m0, m23, m20, q2020 ; 3
+ vshufi32x4 m1, m16, m28, q3131 ; 29
+ vshufi32x4 m2, m15, m27, q2020 ; 19
+ vshufi32x4 m3, m24, m21, q3131 ; 13
+ call .main_part1
+ call .main_part2
+ mova m0, [cq+64* 1] ; a0
+ mova m15, [cq+64* 0] ; b0
+ mova m3, [cq+64* 2] ; c0
+ mova m16, [cq+64* 3] ; d0
+ mova m14, [cq+64* 5] ; a4
+ mova m8, [cq+64* 4] ; b4
+ mova m17, [cq+64* 6] ; c4
+ mova m1, [cq+64* 7] ; d4
+ vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24
+ vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08
+ vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24
+ vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08
+ vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28
+ vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12
+ vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28
+ vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12
+ vshufi32x4 m1, m0, m3, q3131 ; 8
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m2, m15, q3131 ; 24
+ vshufi32x4 m2, m15, q2020 ; 16
+ vshufi32x4 m15, m14, m17, q3131 ; 12
+ vshufi32x4 m14, m17, q2020 ; 4
+ vshufi32x4 m17, m16, m8, q3131 ; 28
+ vshufi32x4 m16, m8, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64*12]
+ mova m11, [cq+64*10]
+ mova m12, [cq+64*14]
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m22, [cq+64* 9]
+ mova m27, [cq+64*13]
+ mova m23, [cq+64*11]
+ mova m24, [cq+64*15]
+ vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26
+ vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10
+ vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26
+ vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10
+ vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30
+ vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14
+ vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30
+ vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14
+ vshufi32x4 m28, m26, m8, q3131 ; 26
+ vshufi32x4 m26, m8, q2020 ; 18
+ vshufi32x4 m24, m22, m9, q3131 ; 10
+ vshufi32x4 m22, m9, q2020 ; 2
+ vshufi32x4 m29, m27, m11, q3131 ; 30
+ vshufi32x4 m27, m11, q2020 ; 22
+ vshufi32x4 m25, m23, m12, q3131 ; 14
+ vshufi32x4 m23, m12, q2020 ; 6
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ jmp .end
+.fast: ; bottom/right halves are zero
+ pmulhrsw ym9, ym23, [cq+64* 0]
+ pmulhrsw ym6, ym23, [cq+64* 8]
+ mova m14, [o(dup16_perm)]
+ pmulhrsw ym8, ym23, [cq+64* 2]
+ pmulhrsw xm0, xm23, [cq+64*14]
+ pmulhrsw xm5, xm23, [cq+64*10]
+ pmulhrsw ym1, ym23, [cq+64* 6]
+ pmulhrsw ym7, ym23, [cq+64* 4]
+ pmulhrsw xm3, xm23, [cq+64*12]
+ pmovzxwd m9, ym9
+ pmovzxwd m6, ym6
+ vpermb m8, m14, m8
+ punpcklwd xm0, xm0
+ vpermb ym5, ym14, ym5
+ vpermb m1, m14, m1
+ vpermb m7, m14, m7
+ punpcklwd xm3, xm3
+ pslld m9, 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpmulhrsw ym21, ym23, [cq+64* 1]
+ {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which
+ {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to
+ {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements
+ {evex}vpmulhrsw ym18, ym23, [cq+64* 5]
+ {evex}vpmulhrsw xm16, xm23, [cq+64*11]
+ {evex}vpmulhrsw xm19, xm23, [cq+64*13]
+ {evex}vpmulhrsw ym23, [cq+64* 3]
+ vpermb m21, m14, m21
+ punpcklwd xm17, xm17
+ vpermb ym20, ym14, ym20
+ vpermb m15, m14, m15
+ vpermb m18, m14, m18
+ vpermb ym16, ym14, ym16
+ punpcklwd xm19, xm19
+ vpermb m14, m14, m23
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m9, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
+ vshufi32x4 m16, m0, m3, q2020 ; 0
+ vshufi32x4 m26, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m14, m2, q2020 ; 1
+ vshufi32x4 m14, m2, q3131 ; 5
+ vshufi32x4 m3, m19, m7, q3131 ; 15
+ vshufi32x4 m19, m7, q2020 ; 11
+ vshufi32x4 m27, m17, m9, q2020 ; 3
+ vshufi32x4 m17, m9, q3131 ; 7
+ vshufi32x4 m28, m20, m6, q2020 ; 9
+ vshufi32x4 m20, m6, q3131 ; 13
+ vshufi32x4 m22, m1, m18, q2020 ; 2
+ vshufi32x4 m23, m1, m18, q3131 ; 6
+ vshufi32x4 m24, m5, m15, q2020 ; 10
+ vshufi32x4 m25, m5, m15, q3131 ; 14
+ vshufi32x4 m15, m21, m4, q3131 ; 12
+ vshufi32x4 m21, m21, m4, q2020 ; 8
+ mov r4, rsp
+ call .main_part1_fast
+ mova m0, m17
+ mova m3, m28
+ call .main_part1_fast
+ mova m0, m14
+ mova m3, m19
+ call .main_part1_fast
+ mova m0, m27
+ mova m3, m20
+ call .main_part1_fast
+ call .main_part2
+ mova m0, m16
+ mova m1, m21
+ mova m14, m26
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [cq+64*14], m21
+ mova [cq+64* 0], m14
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64* 4], m16
+ mova [cq+64* 2], m15
+ mova [cq+64*12], m20
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+.end:
+ lea r4, [strideq*3]
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r5, [r4+strideq] ; stride*4
+ lea r3, [dstq+r4*8]
+ lea r6, [strideq+r5*8] ; stride*33
+ lea r8, [r4+r5*8] ; stride*35
+ add r3, r5 ; dst+stride*28
+ lea r7, [r6+strideq] ; stride*34
+%macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
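+ ; combines the idct32 results (registers/cq) with the idct64 halves kept on
+ ; the stack, rounds with pw_2048, adds the result to four destination rows
+ ; and (for the second half) clears the consumed coefficient rows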
+%if %2 < 8
+ paddsw m10, m%2, m%1
+ psubsw m11, m%2, m%1
+%else
+ mova m11, [cq+64*(%2*2-16)]
+ paddsw m10, m11, m%1
+ psubsw m11, m%1
+%endif
+ mova m9, [rsp+64*(31-%2)]
+ mova m%1, [rsp+64*%2]
+ paddsw m8, m10, m9
+ psubsw m10, m9
+ paddsw m9, m11, m%1
+ pmovzxbw m0, [dstq+%3]
+ psubsw m11, m%1
+ pmovzxbw m%1, [r3 +%4]
+ REPX {pmulhrsw x, m12}, m8, m10, m9, m11
+ paddw m8, m0
+ pmovzxbw m0, [r3 +%5]
+ paddw m10, m%1
+ pmovzxbw m%1, [dstq+%6]
+ paddw m9, m0
+ paddw m11, m%1
+%if %2 >= 8
+%if %2 == 8
+ pxor m1, m1
+%endif
+ mova [cq+64*(%2*2-16)], m1
+ mova [cq+64*(%2*2-15)], m1
+%endif
+ packuswb m8, m10
+ packuswb m9, m11
+ vpermq m8, m13, m8
+ vpermq m9, m13, m9
+ mova [dstq+%3], ym8
+ vextracti32x8 [r3 +%4], m8, 1
+ mova [r3 +%5], ym9
+ vextracti32x8 [dstq+%6], m9, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+ add dstq, r5
+ sub r3, r5
+%endif
+%endmacro
+ IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8
+ RET
+.dconly:
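+ ; dc-only path: scale the dc coefficient by 181/256 (~1/sqrt(2)) twice,
+ ; with rounding folded into the shifts, then reuse the 32x8 dconly store
+ ; loop for all 64 rows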
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 64
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
+ALIGN function_align ; bottom three-quarters are zero
+.main_part1_fast:
+ vpbroadcastd m1, [o(idct64_mul+4*0)]
+ vpbroadcastd m8, [o(idct64_mul+4*1)]
+ vpbroadcastd m2, [o(idct64_mul+4*6)]
+ vpbroadcastd m9, [o(idct64_mul+4*7)]
+ pmulhrsw m1, m0 ; t63a
+ pmulhrsw m0, m8 ; t32a
+ pmulhrsw m2, m3 ; t60a
+ pmulhrsw m3, m9 ; t35a
+ mova m8, m0
+ mova m7, m1
+ mova m6, m3
+ mova m5, m2
+ jmp .main_part1b
+.main_part1:
+ ; idct64 steps 1-5:
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+ vpbroadcastd m7, [o(idct64_mul+4*0)]
+ vpbroadcastd m8, [o(idct64_mul+4*1)]
+ vpbroadcastd m6, [o(idct64_mul+4*2)]
+ vpbroadcastd m9, [o(idct64_mul+4*3)]
+ pmulhrsw m7, m0 ; t63a
+ vpbroadcastd m5, [o(idct64_mul+4*4)]
+ pmulhrsw m0, m8 ; t32a
+ vpbroadcastd m8, [o(idct64_mul+4*5)]
+ pmulhrsw m6, m1 ; t62a
+ vpbroadcastd m4, [o(idct64_mul+4*6)]
+ pmulhrsw m1, m9 ; t33a
+ vpbroadcastd m9, [o(idct64_mul+4*7)]
+ pmulhrsw m5, m2 ; t61a
+ pmulhrsw m2, m8 ; t34a
+ pmulhrsw m4, m3 ; t60a
+ pmulhrsw m3, m9 ; t35a
+ psubsw m8, m0, m1 ; t33
+ paddsw m0, m1 ; t32
+ psubsw m1, m7, m6 ; t62
+ paddsw m7, m6 ; t63
+ psubsw m6, m3, m2 ; t34
+ paddsw m3, m2 ; t35
+ psubsw m2, m4, m5 ; t61
+ paddsw m5, m4 ; t60
+.main_part1b:
+ vpbroadcastd m11, [o(idct64_mul+4*8)]
+ vpbroadcastd m12, [o(idct64_mul+4*9)]
+ ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
+ vpbroadcastd m11, [o(idct64_mul+4*10)]
+ ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
+ vpbroadcastd m11, [o(idct64_mul+4*11)]
+ vpbroadcastd m12, [o(idct64_mul+4*12)]
+ psubsw m4, m0, m3 ; t35a
+ paddsw m0, m3 ; t32a
+ psubsw m3, m7, m5 ; t60a
+ paddsw m7, m5 ; t63a
+ psubsw m5, m1, m2 ; t34
+ paddsw m1, m2 ; t33
+ psubsw m2, m8, m6 ; t61
+ paddsw m6, m8 ; t62
+ add r5, 4*13
+ ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60
+ ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
+ mova [r4+64*0], m0
+ mova [r4+64*7], m7
+ mova [r4+64*1], m1
+ mova [r4+64*6], m6
+ mova [r4+64*3], m3
+ mova [r4+64*4], m4
+ mova [r4+64*2], m2
+ mova [r4+64*5], m5
+ add r4, 64*8
+ ret
+.main_part2:
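+ ; second stage of the idct64 odd half: combines the four groups written by
+ ; .main_part1, walking r4 forward and r6 backward through the scratch
+ ; buffer until the two pointers meet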
+ vpbroadcastd m11, [o(pw_1567_3784 -16*13)]
+ vpbroadcastd m12, [o(pw_m3784_1567 -16*13)]
+ lea r6, [r4+64*7]
+ vpbroadcastd m17, [o(pw_m1567_m3784-16*13)]
+ vpbroadcastd m18, [o(pw_2896_2896 -16*13)]
+ vpbroadcastd m19, [o(pw_m2896_2896 -16*13)]
+ sub r5, 16*13
+.main_part2_loop:
+ mova m0, [r4-64*32] ; t32a
+ mova m1, [r6-64*24] ; t39a
+ mova m2, [r6-64*32] ; t63a
+ mova m3, [r4-64*24] ; t56a
+ mova m4, [r4-64*16] ; t40a
+ mova m5, [r6-64* 8] ; t47a
+ mova m6, [r6-64*16] ; t55a
+ mova m7, [r4-64* 8] ; t48a
+ psubsw m8, m0, m1 ; t39
+ paddsw m0, m1 ; t32
+ psubsw m1, m2, m3 ; t56
+ paddsw m2, m3 ; t63
+ psubsw m3, m5, m4 ; t40
+ paddsw m5, m4 ; t47
+ psubsw m4, m7, m6 ; t55
+ paddsw m7, m6 ; t48
+ ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
+ ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
+ psubsw m6, m2, m7 ; t48a
+ paddsw m2, m7 ; t63a
+ psubsw m7, m0, m5 ; t47a
+ paddsw m0, m5 ; t32a
+ psubsw m5, m8, m3 ; t55
+ paddsw m8, m3 ; t56
+ psubsw m3, m1, m4 ; t40
+ paddsw m1, m4 ; t39
+ ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48
+ ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
+ mova [r6-64* 8], m2
+ mova [r4-64*32], m0
+ mova [r4-64* 8], m8
+ mova [r6-64*32], m1
+ mova [r6-64*24], m6
+ mova [r4-64*16], m7
+ mova [r4-64*24], m5
+ mova [r6-64*16], m3
+ add r4, 64
+ sub r6, 64
+ cmp r4, r6
+ jb .main_part2_loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob
+ vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ pmulhrsw m0, m23, [cq+64* 1]
+ pmulhrsw m1, m23, [cq+64*31]
+ pmulhrsw m2, m23, [cq+64*17]
+ pmulhrsw m3, m23, [cq+64*15]
+ vpbroadcastd m10, [o(pd_2048)]
+ mov r4, rsp
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 7]
+ pmulhrsw m1, m23, [cq+64*25]
+ pmulhrsw m2, m23, [cq+64*23]
+ pmulhrsw m3, m23, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 5]
+ pmulhrsw m1, m23, [cq+64*27]
+ pmulhrsw m2, m23, [cq+64*21]
+ pmulhrsw m3, m23, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 3]
+ pmulhrsw m1, m23, [cq+64*29]
+ pmulhrsw m2, m23, [cq+64*19]
+ pmulhrsw m3, m23, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ pmulhrsw m3, m23, [cq+64*24]
+ pmulhrsw m1, m23, [cq+64* 8]
+ pmulhrsw m2, m23, [cq+64*16]
+ pmulhrsw m0, m23, [cq+64* 0]
+ pmulhrsw m14, m23, [cq+64* 4]
+ pmulhrsw m17, m23, [cq+64*28]
+ pmulhrsw m16, m23, [cq+64*20]
+ pmulhrsw m15, m23, [cq+64*12]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ pmulhrsw m22, m23, [cq+64* 2]
+ pmulhrsw m29, m23, [cq+64*30]
+ pmulhrsw m26, m23, [cq+64*18]
+ pmulhrsw m25, m23, [cq+64*14]
+ pmulhrsw m24, m23, [cq+64*10]
+ pmulhrsw m27, m23, [cq+64*22]
+ pmulhrsw m28, m23, [cq+64*26]
+ pmulhrsw m23, [cq+64* 6]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_16384)]
+ call .pass1_end_part1
+ mova [cq+64*16], m1
+ mova [cq+64*17], m3
+ mova [cq+64*18], m5
+ mova [cq+64*19], m7
+ mova [cq+64*24], m23
+ mova [cq+64*25], m25
+ mova [cq+64*26], m27
+ mova [cq+64*27], m29
+ pmulhrsw m23, m13, m0 ; a0
+ pmulhrsw m25, m13, m2 ; a2
+ pmulhrsw m27, m13, m4 ; a4
+ pmulhrsw m29, m13, m6 ; a6
+ REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
+ call .pass1_end_part2
+ mova [cq+64*20], m15
+ mova [cq+64*21], m17
+ mova [cq+64*22], m19
+ mova [cq+64*23], m21
+ mova [cq+64*28], m1
+ mova [cq+64*29], m3
+ mova [cq+64*30], m5
+ mova [cq+64*31], m7
+ REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6
+ REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6
+ vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01
+ vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03
+ vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01
+ vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03
+ vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41
+ vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43
+ vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41
+ vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43
+ vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21
+ vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23
+ vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21
+ vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23
+ vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61
+ vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63
+ vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61
+ vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63
+ vshufi32x4 m2, m3, m15, q3131 ; 8
+ vshufi32x4 m0, m3, m15, q2020 ; 0
+ vshufi32x4 m6, m23, m22, q3131 ; 24
+ vshufi32x4 m4, m23, m22, q2020 ; 16
+ vshufi32x4 m3, m1, m18, q3131 ; 12
+ vshufi32x4 m1, m18, q2020 ; 4
+ vshufi32x4 m7, m27, m26, q3131 ; 28
+ vshufi32x4 m5, m27, m26, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m16, m14, m17, q3131 ; 10
+ vshufi32x4 m14, m17, q2020 ; 2
+ vshufi32x4 m17, m19, m20, q3131 ; 14
+ vshufi32x4 m15, m19, m20, q2020 ; 6
+ vshufi32x4 m20, m25, m24, q3131 ; 26
+ vshufi32x4 m18, m25, m24, q2020 ; 18
+ vshufi32x4 m21, m29, m28, q3131 ; 30
+ vshufi32x4 m19, m29, m28, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ pmulhrsw m22, m13, [cq+64*16] ; a1
+ pmulhrsw m23, m13, [cq+64*20] ; c1
+ pmulhrsw m24, m13, [cq+64*24] ; e1
+ pmulhrsw m25, m13, [cq+64*28] ; g1
+ pmulhrsw m26, m13, [cq+64*17] ; a3
+ pmulhrsw m27, m13, [cq+64*21] ; c3
+ pmulhrsw m28, m13, [cq+64*25] ; e3
+ pmulhrsw m29, m13, [cq+64*29] ; g3
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ pmulhrsw m14, m13, [cq+64*18] ; a5
+ pmulhrsw m15, m13, [cq+64*22] ; c5
+ pmulhrsw m16, m13, [cq+64*26] ; e5
+ pmulhrsw m17, m13, [cq+64*30] ; g5
+ pmulhrsw m18, m13, [cq+64*19] ; a7
+ pmulhrsw m19, m13, [cq+64*23] ; c7
+ pmulhrsw m20, m13, [cq+64*27] ; e7
+ pmulhrsw m21, m13, [cq+64*31] ; g7
+ vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11
+ vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13
+ vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11
+ vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13
+ vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31
+ vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33
+ vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31
+ vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51
+ vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53
+ vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51
+ vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53
+ vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71
+ vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73
+ vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71
+ vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73
+ vshufi32x4 m27, m23, m11, q3131 ; 11 m27
+ vshufi32x4 m23, m11, q2020 ; 3 m23
+ vshufi32x4 m19, m26, m28, q3131 ; 27 m19
+ vshufi32x4 m15, m26, m28, q2020 ; 19 m15
+ vshufi32x4 m29, m25, m17, q3131 ; 15 m29
+ vshufi32x4 m25, m17, q2020 ; 7 m25
+ vshufi32x4 m21, m18, m20, q3131 ; 31 m21
+ vshufi32x4 m17, m18, m20, q2020 ; 23 m17
+ vshufi32x4 m20, m14, m16, q3131 ; 29 m20
+ vshufi32x4 m16, m14, m16, q2020 ; 21 m16
+ vshufi32x4 m18, m22, m24, q3131 ; 25 m18
+ vshufi32x4 m14, m22, m24, q2020 ; 17 m14
+ vshufi32x4 m26, m8, m9, q3131 ; 9 m26
+ vshufi32x4 m22, m8, m9, q2020 ; 1 m22
+ vshufi32x4 m28, m12, m13, q3131 ; 13 m28
+ vshufi32x4 m24, m12, m13, q2020 ; 5 m24
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ vpbroadcastd m13, [o(pw_16384)]
+ pmulhrsw m0, m13, [r4-64*21]
+ pmulhrsw m1, m13, [r4-64*22]
+ pmulhrsw m2, m13, [r4-64*23]
+ pmulhrsw m3, m13, [r4-64*24]
+ pmulhrsw m4, m13, [r4-64*25]
+ pmulhrsw m5, m13, [r4-64*26]
+ pmulhrsw m6, m13, [r4-64*27]
+ pmulhrsw m7, m13, [r4-64*28]
+ mova [cq+64*16], m14
+ mova [cq+64*17], m15
+ mova [cq+64*18], m16
+ mova [cq+64*19], m17
+ mova [cq+64*20], m18
+ mova [cq+64*21], m19
+ mova [cq+64*22], m20
+ mova [cq+64*23], m21
+ pmulhrsw m14, m13, [r4-64*12]
+ pmulhrsw m15, m13, [r4-64*11]
+ pmulhrsw m16, m13, [r4-64*10]
+ pmulhrsw m17, m13, [r4-64* 9]
+ pmulhrsw m18, m13, [r4-64* 8]
+ pmulhrsw m19, m13, [r4-64* 7]
+ pmulhrsw m20, m13, [r4-64* 6]
+ pmulhrsw m21, m13, [r4-64* 5]
+ mova [cq+64*24], m22
+ mova [cq+64*25], m23
+ mova [cq+64*26], m24
+ mova [cq+64*27], m25
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call .transpose_2x8x8_lo
+ mova [r4-64*12], m1
+ mova [r4-64*11], m3
+ mova [r4-64*10], m5
+ mova [r4-64* 9], m7
+ mova [r4-64* 8], m15
+ mova [r4-64* 7], m17
+ mova [r4-64* 6], m19
+ mova [r4-64* 5], m21
+ vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01
+ vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03
+ vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21
+ vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23
+ vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41
+ vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43
+ vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61
+ vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63
+ pmulhrsw m0, m13, [r4-64*20]
+ pmulhrsw m1, m13, [r4-64*19]
+ pmulhrsw m2, m13, [r4-64*18]
+ pmulhrsw m3, m13, [r4-64*17]
+ pmulhrsw m4, m13, [r4-64*16]
+ pmulhrsw m5, m13, [r4-64*15]
+ pmulhrsw m6, m13, [r4-64*14]
+ pmulhrsw m7, m13, [r4-64*13]
+ pmulhrsw m14, m13, [r4-64*29]
+ pmulhrsw m15, m13, [r4-64*30]
+ pmulhrsw m16, m13, [r4-64*31]
+ pmulhrsw m17, m13, [r4-64*32]
+ pmulhrsw m18, m13, [r4-64*33]
+ pmulhrsw m19, m13, [r4-64*34]
+ pmulhrsw m20, m13, [r4-64*35]
+ pmulhrsw m21, m13, [r4-64*36]
+ call .transpose_2x8x8_lo
+ mova [r4-64*20], m1
+ mova [r4-64*19], m3
+ mova [r4-64*18], m5
+ mova [r4-64*17], m7
+ mova [r4-64*16], m15
+ mova [r4-64*15], m17
+ mova [r4-64*14], m19
+ mova [r4-64*13], m21
+ vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41
+ vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43
+ vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03
+ vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01
+ vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21
+ vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23
+ vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61
+ vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63
+ vshufi32x4 m2, m0, m22, q3131 ; 8
+ vshufi32x4 m0, m22, q2020 ; 0
+ vshufi32x4 m3, m1, m26, q3131 ; 12
+ vshufi32x4 m1, m26, q2020 ; 4
+ vshufi32x4 m6, m4, m23, q3131 ; 24
+ vshufi32x4 m4, m23, q2020 ; 16
+ vshufi32x4 m7, m5, m27, q3131 ; 28
+ vshufi32x4 m5, m27, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m16, m14, m24, q3131 ; 10
+ vshufi32x4 m14, m24, q2020 ; 2
+ vshufi32x4 m17, m15, m28, q3131 ; 14
+ vshufi32x4 m15, m28, q2020 ; 6
+ vshufi32x4 m20, m18, m25, q3131 ; 26
+ vshufi32x4 m18, m25, q2020 ; 18
+ vshufi32x4 m21, m19, m29, q3131 ; 30
+ vshufi32x4 m19, m29, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova m22, [r4-64*20]
+ mova m26, [r4-64*16]
+ mova m23, [r4-64*19]
+ mova m27, [r4-64*15]
+ mova m24, [r4-64*18]
+ mova m28, [r4-64*14]
+ mova m25, [r4-64*17]
+ mova m29, [r4-64*13]
+ mova [r4-64*20], m14
+ mova [r4-64*19], m15
+ mova [r4-64*18], m16
+ mova [r4-64*17], m17
+ mova [r4-64*16], m18
+ mova [r4-64*15], m19
+ mova [r4-64*14], m20
+ mova [r4-64*13], m21
+ mova m19, [r4-64*12]
+ mova m11, [r4-64* 8]
+ mova m20, [r4-64*11]
+ mova m12, [r4-64* 7]
+ mova m21, [r4-64*10]
+ mova m8, [r4-64* 6]
+ mova m9, [r4-64* 9]
+ mova m18, [r4-64* 5]
+ vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13
+ vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11
+ vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33
+ vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31
+ vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53
+ vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51
+ vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73
+ vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71
+ vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11
+ vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13
+ vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31
+ vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33
+ vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51
+ vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53
+ vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71
+ vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73
+ vshufi32x4 m26, m22, m27, q3131 ; 9
+ vshufi32x4 m22, m27, q2020 ; 1
+ vshufi32x4 m27, m23, m28, q3131 ; 11
+ vshufi32x4 m23, m28, q2020 ; 3
+ vshufi32x4 m28, m24, m29, q3131 ; 13
+ vshufi32x4 m24, m29, q2020 ; 5
+ vshufi32x4 m29, m25, m8, q3131 ; 15
+ vshufi32x4 m25, m8, q2020 ; 7
+ vshufi32x4 m18, m14, m19, q3131 ; 25
+ vshufi32x4 m14, m19, q2020 ; 17
+ vshufi32x4 m19, m15, m20, q3131 ; 27
+ vshufi32x4 m15, m20, q2020 ; 19
+ vshufi32x4 m20, m16, m21, q3131 ; 29
+ vshufi32x4 m16, m21, q2020 ; 21
+ vshufi32x4 m21, m17, m9, q3131 ; 31
+ vshufi32x4 m17, m9, q2020 ; 23
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ jmp .end
+.fast: ; bottom/right halves are zero
+ {evex}vpmulhrsw ym8, ym23, [cq+64* 4]
+ {evex}vpmulhrsw xm1, xm23, [cq+64*12]
+ mova m28, [o(dup16_perm)]
+ {evex}vpmulhrsw ym7, ym23, [cq+64* 8]
+ vpmulhrsw ym22, ym23, [cq+64* 0]
+ vpermb m8, m28, m8
+ vpermb ym1, ym28, ym1
+ vpermb m7, m28, m7
+ pmovzxwd m9, ym22
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ {evex}vpmulhrsw ym21, ym23, [cq+64* 2]
+ {evex}vpmulhrsw xm15, xm23, [cq+64*14]
+ {evex}vpmulhrsw xm18, xm23, [cq+64*10]
+ {evex}vpmulhrsw ym14, ym23, [cq+64* 6]
+ vpermb m21, m28, m21
+ punpcklwd xm15, xm15
+ vpermb ym18, ym28, ym18
+ vpermb m14, m28, m14
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vpmulhrsw ym22, ym23, [cq+64* 1]
+ {evex}vpmulhrsw xm29, xm23, [cq+64*15]
+ {evex}vpmulhrsw xm26, xm23, [cq+64* 9]
+ {evex}vpmulhrsw ym25, ym23, [cq+64* 7]
+ {evex}vpmulhrsw ym24, ym23, [cq+64* 5]
+ {evex}vpmulhrsw xm27, xm23, [cq+64*11]
+ {evex}vpmulhrsw xm8, xm23, [cq+64*13]
+ {evex}vpmulhrsw ym23, [cq+64* 3]
+ vpermb m22, m28, m22
+ punpcklwd xm29, xm29
+ vpermb ym26, ym28, ym26
+ vpermb m25, m28, m25
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ REPX {vpermb x, m28, x}, m24, m27, m23
+ punpcklwd xm28, xm8, xm8
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+ mov r4, rsp
+ vpbroadcastd m13, [o(pw_16384)]
+ mova [r4+64*16], m4
+ mova [r4+64*17], m5
+ mova [r4+64*18], m6
+ mova [r4+64*19], m7
+ mova [r4+64*28], m26
+ mova [r4+64*29], m27
+ mova [r4+64*30], m28
+ mova [r4+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
+ mova [r4+64*20], m22
+ mova [r4+64*21], m23
+ mova [r4+64*22], m24
+ mova [r4+64*23], m25
+ mova [r4+64*24], m26
+ mova [r4+64*25], m27
+ mova [r4+64*26], m28
+ mova [r4+64*27], m29
+ call .pass2_fast
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ pmulhrsw m0, m13, [r4+64*16]
+ pmulhrsw m1, m13, [r4+64*17]
+ pmulhrsw m2, m13, [r4+64*18]
+ pmulhrsw m3, m13, [r4+64*19]
+ pmulhrsw m4, m13, [r4+64*20]
+ pmulhrsw m5, m13, [r4+64*21]
+ pmulhrsw m6, m13, [r4+64*22]
+ pmulhrsw m7, m13, [r4+64*23]
+ mova [cq+64*16], m14
+ mova [cq+64*17], m15
+ mova [cq+64*18], m16
+ mova [cq+64*19], m17
+ mova [cq+64*20], m18
+ mova [cq+64*21], m19
+ mova [cq+64*22], m20
+ mova [cq+64*23], m21
+ pmulhrsw m14, m13, [r4+64*24]
+ pmulhrsw m15, m13, [r4+64*25]
+ pmulhrsw m16, m13, [r4+64*26]
+ pmulhrsw m17, m13, [r4+64*27]
+ pmulhrsw m18, m13, [r4+64*28]
+ pmulhrsw m19, m13, [r4+64*29]
+ pmulhrsw m20, m13, [r4+64*30]
+ pmulhrsw m21, m13, [r4+64*31]
+ mova [cq+64*24], m22
+ mova [cq+64*25], m23
+ mova [cq+64*26], m24
+ mova [cq+64*27], m25
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
+ call .pass2_fast
+ mova [r4+64*16], m14
+ mova [r4+64*17], m15
+ mova [r4+64*18], m16
+ mova [r4+64*19], m17
+ mova [r4+64*20], m18
+ mova [r4+64*21], m19
+ mova [r4+64*22], m20
+ mova [r4+64*23], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+.end:
+ vpbroadcastd m13, [o(pw_2048)]
+ lea r5, [strideq*3]
+ pxor m12, m12
+ lea r3, [dstq+r5*8]
+ lea r6, [strideq+r5] ; stride*4
+ add r3, r6 ; dst+stride*28
+%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi
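+ ; writes one mirrored row pair (n and 31-n) of the 64x32 result: combine
+ ; the halves kept in cq/registers/stack, round with pw_2048, add to the
+ ; destination bytes and clear the consumed cq rows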
+ mova m11, [cq+64*( %3)] ; 0
+ mova m9, [cq+64*(31-%3)] ; 31
+%if %3 >= 8
+ mova m%1, [rsp+64*(%1+16)]
+%endif
+ mova m10, [dstq+%4]
+ paddsw m8, m11, m9
+ psubsw m11, m9
+ paddsw m9, m%1, m%2
+ psubsw m%1, m%2
+ punpcklbw m%2, m10, m12
+ punpckhbw m10, m12
+ pmulhrsw m8, m13
+ pmulhrsw m9, m13
+ paddw m8, m%2
+ paddw m9, m10
+ mova m10, [r3+%5]
+ pmulhrsw m11, m13
+ pmulhrsw m%1, m13
+ mova [cq+64*( %3)], m12
+ mova [cq+64*(31-%3)], m12
+ punpcklbw m%2, m10, m12
+ punpckhbw m10, m12
+ packuswb m8, m9
+ paddw m11, m%2
+ paddw m%1, m10
+ packuswb m11, m%1
+ mova [dstq+%4], m8
+ mova [r3 +%5], m11
+%if %3 == 3 || %3 == 7 || %3 == 11
+ add dstq, r6
+ sub r3, r6
+%endif
+%endmacro
+ IDCT_64x32_END 0, 29, 0, strideq*0, r5
+ IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2
+ IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1
+ IDCT_64x32_END 3, 26, 3, r5 , strideq*0
+ IDCT_64x32_END 4, 25, 4, strideq*0, r5
+ IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2
+ IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1
+ IDCT_64x32_END 7, 22, 7, r5 , strideq*0
+ IDCT_64x32_END 0, 21, 8, strideq*0, r5
+ IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2
+ IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1
+ IDCT_64x32_END 3, 18, 11, r5 , strideq*0
+ IDCT_64x32_END 4, 17, 12, strideq*0, r5
+ IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2
+ IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1
+ IDCT_64x32_END 7, 14, 15, r5 , strideq*0
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2
+ALIGN function_align
+.pass1_end_part1:
+%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
+%if %1 != %3
+ mova m%1, [cq+64*%1]
+%endif
+ mova m9, [r4+64*(%3-36)] ; idct64 32+n
+ mova m11, [r4+64*(-5-%3)] ; idct64 63-n
+ psubsw m8, m%1, m%2 ; idct32 31-n
+ paddsw m%1, m%2 ; idct32 0+n
+%if %1 == %3
+ psubsw m%2, m8, m9 ; out 32+n e
+ paddsw m8, m9 ; out 31-n d
+ psubsw m9, m%1, m11 ; out 63-n h
+ paddsw m%1, m11 ; out 0+n a
+%else
+ paddsw m%2, m8, m9 ; out 23-n c
+ psubsw m8, m9 ; out 40+n f
+ paddsw m9, m%1, m11 ; out 8+n b
+ psubsw m%1, m11 ; out 55-n g
+%endif
+ mova [r4+64*(%3-36)], m8
+ mova [r4+64*(-5-%3)], m9
+%endmacro
+ IDCT_64x32_PASS1_END 0, 29, 0
+ IDCT_64x32_PASS1_END 1, 28, 1
+ IDCT_64x32_PASS1_END 2, 27, 2
+ IDCT_64x32_PASS1_END 3, 26, 3
+ IDCT_64x32_PASS1_END 4, 25, 4
+ IDCT_64x32_PASS1_END 5, 24, 5
+ IDCT_64x32_PASS1_END 6, 23, 6
+ IDCT_64x32_PASS1_END 7, 22, 7
+.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
+ punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3
+ punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3
+ punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5
+ punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckhqdq m23, m22, m27 ; 1 23
+ punpcklqdq m22, m27 ; 0 22
+ punpckhqdq m27, m26, m28 ; 5 27
+ punpcklqdq m26, m28 ; 4 26
+ punpcklqdq m28, m29, m25 ; 6 28
+ punpckhqdq m29, m25 ; 7 29
+ punpckhqdq m25, m24, m8 ; 3 25
+ punpcklqdq m24, m8 ; 2 24
+.transpose_8x8:
+ punpckhwd m8, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ ret
+.pass1_end_part2:
+ IDCT_64x32_PASS1_END 0, 21, 8
+ IDCT_64x32_PASS1_END 1, 20, 9
+ IDCT_64x32_PASS1_END 2, 19, 10
+ IDCT_64x32_PASS1_END 3, 18, 11
+ IDCT_64x32_PASS1_END 4, 17, 12
+ IDCT_64x32_PASS1_END 5, 16, 13
+ IDCT_64x32_PASS1_END 6, 15, 14
+ IDCT_64x32_PASS1_END 7, 14, 15
+.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21
+ punpcklwd m8, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m7, m6
+ punpckhwd m7, m6
+ punpcklwd m6, m5, m4
+ punpckhwd m5, m4
+ punpckldq m4, m7, m5
+ punpckhdq m7, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m6
+ punpckldq m0, m6
+ punpckldq m6, m3, m1
+ punpckhdq m3, m1
+ punpckhqdq m1, m0, m5
+ punpcklqdq m0, m5
+ punpckhqdq m5, m4, m6
+ punpcklqdq m4, m6
+ punpcklqdq m6, m7, m3
+ punpckhqdq m7, m3
+ punpckhqdq m3, m2, m8
+ punpcklqdq m2, m8
+ punpckhwd m8, m18, m19
+ punpcklwd m18, m19
+ punpckhwd m19, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m20, m21
+ punpcklwd m20, m21
+ punpckhwd m21, m16, m17
+ punpcklwd m16, m17
+ punpckhdq m17, m14, m16
+ punpckldq m14, m16
+ punpckldq m16, m18, m20
+ punpckhdq m18, m20
+ punpckhdq m20, m19, m21
+ punpckldq m19, m21
+ punpckldq m21, m8, m15
+ punpckhdq m8, m15
+ punpckhqdq m15, m14, m16
+ punpcklqdq m14, m16
+ punpcklqdq m16, m17, m18
+ punpckhqdq m17, m18
+ punpcklqdq m18, m19, m21
+ punpckhqdq m19, m21
+ punpckhqdq m21, m20, m8
+ punpcklqdq m20, m8
+ ret
+.pass2_fast:
+ vshufi32x4 m24, m9, m15, q3131 ; 5
+ vshufi32x4 m22, m9, m15, q2020 ; 1
+ vshufi32x4 m15, m1, m16, q3131 ; 6
+ vshufi32x4 m14, m1, m16, q2020 ; 2
+ vshufi32x4 m1, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m8, m2, q3131 ; 12
+ vshufi32x4 m2, m8, m2, q2020 ; 8
+ vshufi32x4 m25, m11, m17, q3131 ; 7
+ vshufi32x4 m23, m11, m17, q2020 ; 3
+ vshufi32x4 m17, m5, m19, q3131 ; 14
+ vshufi32x4 m16, m5, m19, q2020 ; 10
+ vshufi32x4 m29, m6, m20, q3131 ; 15
+ vshufi32x4 m27, m6, m20, q2020 ; 11
+ vshufi32x4 m28, m4, m18, q3131 ; 13
+ vshufi32x4 m26, m4, m18, q2020 ; 9
+ jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ vpbroadcastd m10, [o(pd_2048)]
+ mov r4, rsp
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 7]
+ mova m1, [cq+64*25]
+ mova m2, [cq+64*23]
+ mova m3, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 3]
+ mova m1, [cq+64*29]
+ mova m2, [cq+64*19]
+ mova m3, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 8]
+ mova m2, [cq+64*16]
+ mova m3, [cq+64*24]
+ mova m14, [cq+64* 4]
+ mova m15, [cq+64*12]
+ mova m16, [cq+64*20]
+ mova m17, [cq+64*28]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova m22, [cq+64* 2]
+ mova m29, [cq+64*30]
+ mova m26, [cq+64*18]
+ mova m25, [cq+64*14]
+ mova m24, [cq+64*10]
+ mova m27, [cq+64*22]
+ mova m28, [cq+64*26]
+ mova m23, [cq+64* 6]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1
+ mova [r4+64*36], m1
+ mova [r4+64*37], m3
+ mova [r4+64*38], m5
+ mova [r4+64*39], m7
+ mova [r4+64*44], m23
+ mova [r4+64*45], m25
+ mova [r4+64*46], m27
+ mova [r4+64*47], m29
+ pmulhrsw m23, m13, m0 ; a0
+ pmulhrsw m25, m13, m2 ; a2
+ pmulhrsw m27, m13, m4 ; a4
+ pmulhrsw m29, m13, m6 ; a6
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2
+ lea r6, [r4-64*4]
+ add r4, 64*28
+ call .pass2_end
+ mov r4, rsp
+ mova m0, [r4+64*23]
+ mova m1, [r4+64*22]
+ mova m2, [r4+64*21]
+ mova m3, [r4+64*20]
+ mova m4, [r4+64*19]
+ mova m5, [r4+64*18]
+ mova m6, [r4+64*17]
+ mova m7, [r4+64*16]
+ mova m22, [r4+64*15]
+ mova m23, [r4+64*14]
+ mova m24, [r4+64*13]
+ mova m25, [r4+64*12]
+ mova m26, [r4+64*11]
+ mova m27, [r4+64*10]
+ mova m28, [r4+64* 9]
+ mova m29, [r4+64* 8]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [r4+64* 8], m1
+ mova [r4+64* 9], m3
+ mova [r4+64*10], m5
+ mova [r4+64*11], m7
+ mova [r4+64*16], m23
+ mova [r4+64*17], m25
+ mova [r4+64*18], m27
+ mova [r4+64*19], m29
+ pmulhrsw m23, m13, m0 ; b0
+ pmulhrsw m25, m13, m2 ; b2
+ pmulhrsw m27, m13, m4 ; b4
+ pmulhrsw m29, m13, m6 ; b6
+ mova m0, [r4+64*31]
+ mova m1, [r4+64*30]
+ mova m2, [r4+64*29]
+ mova m3, [r4+64*28]
+ mova m4, [r4+64*27]
+ mova m5, [r4+64*26]
+ mova m6, [r4+64*25]
+ mova m7, [r4+64*24]
+ mova m14, [r4+64* 7]
+ mova m15, [r4+64* 6]
+ mova m16, [r4+64* 5]
+ mova m17, [r4+64* 4]
+ mova m18, [r4+64* 3]
+ mova m19, [r4+64* 2]
+ mova m20, [r4+64* 1]
+ mova m21, [r4+64* 0]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo
+ mov r6, cq
+ call .pass2_end
+ jmp .end
+.fast: ; bottom/right halves are zero
+ mova m28, [o(dup16_perm)]
+ pmovzxwd m9, [cq+64* 0]
+ vpermb m8, m28, [cq+64* 4]
+ vpermb ym1, ym28, [cq+64*12]
+ vpermb m7, m28, [cq+64* 8]
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ vpermb m21, m28, [cq+64* 2]
+ vpermb ym15, ym28, [cq+64*14]
+ vpermb ym18, ym28, [cq+64*10]
+ vpermb m14, m28, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vpermb m22, m28, [cq+64* 1]
+ vpermb ym29, ym28, [cq+64*15]
+ vpermb ym26, ym28, [cq+64* 9]
+ vpermb m25, m28, [cq+64* 7]
+ vpermb m24, m28, [cq+64* 5]
+ vpermb ym27, ym28, [cq+64*11]
+ vpermb m23, m28, [cq+64* 3]
+ vpermb ym28, ym28, [cq+64*13]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [cq+64*16], m4
+ mova [cq+64*17], m5
+ mova [cq+64*18], m6
+ mova [cq+64*19], m7
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
+ mova [cq+64*20], m22
+ mova [cq+64*21], m23
+ mova [cq+64*22], m24
+ mova [cq+64*23], m25
+ mova [cq+64*24], m26
+ mova [cq+64*25], m27
+ mova [cq+64*26], m28
+ mova [cq+64*27], m29
+ lea r4, [rsp+64*64]
+ lea r3, [rsp+64*32]
+ call .pass2_fast
+ pmulhrsw m0, m13, [cq+64*16]
+ pmulhrsw m1, m13, [cq+64*17]
+ pmulhrsw m2, m13, [cq+64*18]
+ pmulhrsw m3, m13, [cq+64*19]
+ pmulhrsw m4, m13, [cq+64*20]
+ pmulhrsw m5, m13, [cq+64*21]
+ pmulhrsw m6, m13, [cq+64*22]
+ pmulhrsw m7, m13, [cq+64*23]
+ pmulhrsw m14, m13, [cq+64*24]
+ pmulhrsw m15, m13, [cq+64*25]
+ pmulhrsw m16, m13, [cq+64*26]
+ pmulhrsw m17, m13, [cq+64*27]
+ pmulhrsw m18, m13, [cq+64*28]
+ pmulhrsw m19, m13, [cq+64*29]
+ pmulhrsw m20, m13, [cq+64*30]
+ pmulhrsw m21, m13, [cq+64*31]
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
+ mov r4, rsp
+ mov r3, cq
+ call .pass2_fast
+.end:
+ vpbroadcastd m17, [o(pw_2048)]
+ lea r5, [strideq*8]
+ mov r3, dstq
+ pxor m16, m16
+ sub r4, 64*5 ; rsp+64*31
+ mov r6, rsp
+.end_loop:
+ mova m2, [r6+64*32] ; idct16 0+n lo
+ mova m7, [r6+64*48] ; idct32 31-n lo
+ mova m6, [cq+64* 0] ; idct16 0+n hi
+ mova m0, [cq+64*16] ; idct32 31-n hi
+ mova m4, [r4+64*64] ; idct64 63-n lo
+ mova m1, [r4+64* 0] ; idct64 63-n hi
+ mova m5, [r6+64*64] ; idct64 32+n lo
+ mova m8, [r6+64* 0] ; idct64 32+n hi
+ sub r3, strideq
+ paddsw m3, m2, m7 ; idct32 0+n lo
+ mova m12, [dstq+r5*0]
+ psubsw m2, m7 ; idct32 31-n lo
+ mova m15, [r3 +r5*8]
+ paddsw m7, m6, m0 ; idct32 0+n hi
+ mova m13, [r3 +r5*4]
+ psubsw m6, m0 ; idct32 31-n hi
+ mova m14, [dstq+r5*4]
+ paddsw m0, m3, m4 ; out 0+n lo
+ add r6, 64
+ psubsw m3, m4 ; out 63-n lo
+ sub r4, 64
+ paddsw m4, m7, m1 ; out 0+n hi
+ mova [cq+64* 0], m16
+ psubsw m7, m1 ; out 63-n hi
+ mova [cq+64*16], m16
+ paddsw m1, m2, m5 ; out 31-n lo
+ add cq, 64
+ psubsw m2, m5 ; out 32+n lo
+ paddsw m5, m6, m8 ; out 31-n hi
+ psubsw m6, m8 ; out 32+n hi
+ pmulhrsw m0, m17
+ punpcklbw m8, m12, m16
+ pmulhrsw m4, m17
+ punpckhbw m12, m16
+ pmulhrsw m3, m17
+ punpcklbw m11, m15, m16
+ pmulhrsw m7, m17
+ punpckhbw m15, m16
+ pmulhrsw m1, m17
+ punpcklbw m9, m13, m16
+ pmulhrsw m5, m17
+ punpckhbw m13, m16
+ pmulhrsw m2, m17
+ punpcklbw m10, m14, m16
+ pmulhrsw m6, m17
+ punpckhbw m14, m16
+ paddw m0, m8
+ paddw m4, m12
+ packuswb m0, m4
+ paddw m3, m11
+ paddw m7, m15
+ packuswb m3, m7
+ paddw m1, m9
+ paddw m5, m13
+ packuswb m1, m5
+ paddw m2, m10
+ paddw m6, m14
+ packuswb m2, m6
+ mova [dstq+r5*0], m0
+ mova [r3 +r5*8], m3
+ mova [r3 +r5*4], m1
+ mova [dstq+r5*4], m2
+ add dstq, strideq
+ cmp r6, r4
+ jb .end_loop
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+ALIGN function_align
+.pass2_end:
+ REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
+ mova [r4+64*20], m1
+ mova [r4+64*21], m3
+ mova [r4+64*22], m5
+ mova [r4+64*23], m7
+ vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01
+ vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03
+ vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01
+ vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03
+ mova [r4+64*12], m15
+ mova [r4+64*13], m17
+ mova [r4+64*14], m19
+ mova [r4+64*15], m21
+ vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41
+ vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43
+ vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41
+ vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43
+ vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21
+ vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23
+ vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21
+ vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23
+ vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61
+ vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63
+ vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63
+ vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61
+ vshufi32x4 m0, m1, m5, q2020 ; 0
+ vshufi32x4 m1, m5, q3131 ; 8
+ vshufi32x4 m2, m3, m14, q2020 ; 16
+ vshufi32x4 m3, m14, q3131 ; 24
+ vshufi32x4 m14, m15, m18, q2020 ; 4
+ vshufi32x4 m15, m18, q3131 ; 12
+ vshufi32x4 m16, m17, m19, q2020 ; 20
+ vshufi32x4 m17, m19, q3131 ; 28
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ vshufi32x4 m24, m22, m25, q3131 ; 10
+ vshufi32x4 m22, m25, q2020 ; 2
+ vshufi32x4 m25, m23, m28, q3131 ; 14
+ vshufi32x4 m23, m28, q2020 ; 6
+ vshufi32x4 m28, m26, m27, q3131 ; 26
+ vshufi32x4 m26, m27, q2020 ; 18
+ vshufi32x4 m27, m29, m13, q2020 ; 22
+ vshufi32x4 m29, m13, q3131 ; 30
+ mova [r6+64* 0], m0
+ mova [r6+64* 1], m1
+ mova [r6+64* 2], m2
+ mova [r6+64* 3], m3
+ mova [r6+64* 4], m4
+ mova [r6+64* 5], m5
+ mova [r6+64* 6], m6
+ mova [r6+64* 7], m7
+ mova [r6+64* 8], m14
+ mova [r6+64* 9], m15
+ mova [r6+64*10], m16
+ mova [r6+64*11], m17
+ mova [r6+64*12], m18
+ mova [r6+64*13], m19
+ mova [r6+64*14], m20
+ mova [r6+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [r6+64*16], m29
+ mova [r6+64*17], m28
+ mova [r6+64*18], m27
+ mova [r6+64*19], m26
+ mova [r6+64*20], m25
+ mova [r6+64*21], m24
+ mova [r6+64*22], m23
+ mova [r6+64*23], m22
+ mova [r6+64*24], m21
+ mova [r6+64*25], m20
+ mova [r6+64*26], m19
+ mova [r6+64*27], m18
+ mova [r6+64*28], m17
+ mova [r6+64*29], m16
+ mova [r6+64*30], m15
+ mova [r6+64*31], m14
+ pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25
+ pmulhrsw m16, m13, [r4+64*12]
+ pmulhrsw m17, m13, [r4+64*16]
+ pmulhrsw m18, m13, [r4+64*20]
+ pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31
+ pmulhrsw m20, m13, [r4+64*15]
+ pmulhrsw m21, m13, [r4+64*19]
+ pmulhrsw m22, m13, [r4+64*23]
+ vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9
+ vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25
+ vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9
+ vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25
+ pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29
+ pmulhrsw m24, m13, [r4+64*14]
+ pmulhrsw m25, m13, [r4+64*18]
+ pmulhrsw m26, m13, [r4+64*22]
+ vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15
+ vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31
+ vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15
+ vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31
+ pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27
+ pmulhrsw m28, m13, [r4+64*13]
+ pmulhrsw m29, m13, [r4+64*17]
+ pmulhrsw m13, [r4+64*21]
+ vshufi32x4 m0, m14, m16, q2020 ; 1
+ vshufi32x4 m1, m19, m21, q3131 ; 31
+ vshufi32x4 m2, m15, m17, q2020 ; 17
+ vshufi32x4 m3, m18, m20, q3131 ; 15
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vshufi32x4 m0, m18, m20, q2020 ; 7
+ vshufi32x4 m1, m15, m17, q3131 ; 25
+ vshufi32x4 m2, m19, m21, q2020 ; 23
+ vshufi32x4 m3, m14, m16, q3131 ; 9
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13
+ vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29
+ vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13
+ vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29
+ vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11
+ vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27
+ vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11
+ vshufi32x4 m29, m13, q3232 ; e19 e27 g19 g27
+ vshufi32x4 m0, m22, m24, q2020 ; 5
+ vshufi32x4 m1, m27, m29, q3131 ; 27
+ vshufi32x4 m2, m23, m25, q2020 ; 21
+ vshufi32x4 m3, m26, m28, q3131 ; 11
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vshufi32x4 m0, m26, m28, q2020 ; 3
+ vshufi32x4 m1, m23, m25, q3131 ; 29
+ vshufi32x4 m2, m27, m29, q2020 ; 19
+ vshufi32x4 m3, m22, m24, q3131 ; 13
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ALIGN function_align
+.pass2_fast:
+ vshufi32x4 m23, m1, m16, q3131 ; 6
+ vshufi32x4 m22, m1, m16, q2020 ; 2
+ vshufi32x4 m14, m0, m3, q3131 ; 4
+ vshufi32x4 m26, m0, m3, q2020 ; 0
+ vshufi32x4 m28, m9, m15, q3131 ; 5
+ vshufi32x4 m0, m9, m15, q2020 ; 1
+ vshufi32x4 m16, m11, m17, q3131 ; 7
+ vshufi32x4 m29, m11, m17, q2020 ; 3
+ vshufi32x4 m15, m8, m2, q3131 ; 12
+ vshufi32x4 m27, m8, m2, q2020 ; 8
+ vshufi32x4 m25, m5, m19, q3131 ; 14
+ vshufi32x4 m24, m5, m19, q2020 ; 10
+ vshufi32x4 m3, m6, m20, q3131 ; 15
+ vshufi32x4 m19, m6, m20, q2020 ; 11
+ vshufi32x4 m17, m4, m18, q3131 ; 13
+ vshufi32x4 m18, m4, m18, q2020 ; 9
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m16
+ mova m3, m18
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m28
+ mova m3, m19
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m29
+ mova m3, m17
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ mova m0, m26
+ mova m1, m27
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [r3+64* 0], m0
+ mova [r3+64* 1], m1
+ mova [r3+64* 2], m2
+ mova [r3+64* 3], m3
+ mova [r3+64* 4], m4
+ mova [r3+64* 5], m5
+ mova [r3+64* 6], m6
+ mova [r3+64* 7], m7
+ mova [r3+64* 8], m14
+ mova [r3+64* 9], m15
+ mova [r3+64*10], m16
+ mova [r3+64*11], m17
+ mova [r3+64*12], m18
+ mova [r3+64*13], m19
+ mova [r3+64*14], m20
+ mova [r3+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+ mova [r3+64*16], m29
+ mova [r3+64*17], m28
+ mova [r3+64*18], m27
+ mova [r3+64*19], m26
+ mova [r3+64*20], m25
+ mova [r3+64*21], m24
+ mova [r3+64*22], m23
+ mova [r3+64*23], m22
+ mova [r3+64*24], m21
+ mova [r3+64*25], m20
+ mova [r3+64*26], m19
+ mova [r3+64*27], m18
+ mova [r3+64*28], m17
+ mova [r3+64*29], m16
+ mova [r3+64*30], m15
+ mova [r3+64*31], m14
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx_sse.asm b/third_party/dav1d/src/x86/itx_sse.asm
new file mode 100644
index 0000000000..ec7e3a52f4
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_sse.asm
@@ -0,0 +1,6533 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+
+SECTION_RODATA 16
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
+
+%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
+pw_%1_m%2: times 4 dw %1, -%2
+%if %3 != 2
+pw_%2_%1: times 4 dw %2, %1
+%endif
+%if %3
+pw_m%1_m%2: times 4 dw -%1, -%2
+%endif
+%endmacro
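+; e.g. "COEF_PAIR 1567, 3784" expands to pw_1567_m3784 and pw_3784_1567; a
+; third argument of 1 additionally emits pw_m1567_m3784, and 2 skips
+; pw_3784_1567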
+
+;adst4
+pw_1321_3803: times 4 dw 1321, 3803
+pw_2482_m1321: times 4 dw 2482, -1321
+pw_3344_2482: times 4 dw 3344, 2482
+pw_3344_m3803: times 4 dw 3344, -3803
+pw_3344_m3344: times 4 dw 3344, -3344
+pw_0_3344: times 4 dw 0, 3344
+pw_m6688_m3803: times 4 dw -6688, -3803
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784
+COEF_PAIR 799, 4017
+COEF_PAIR 3406, 2276
+COEF_PAIR 401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 3784, 1567, 1
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4017, 799, 1
+COEF_PAIR 201, 4091
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 4052, 601
+COEF_PAIR 2276, 3406, 1
+COEF_PAIR 4076, 401, 2
+COEF_PAIR 2598, 3166, 2
+COEF_PAIR 3612, 1931, 2
+COEF_PAIR 1189, 3920, 2
+
+pd_2048: times 4 dd 2048
+pw_2048: times 8 dw 2048
+pw_m2048: times 8 dw -2048
+pw_4096: times 8 dw 4096
+pw_16384: times 8 dw 16384
+pw_m16384: times 8 dw -16384
+pw_1697x16: times 8 dw 1697*16
+pw_1697x8: times 8 dw 1697*8
+pw_2896x8: times 8 dw 2896*8
+pw_3344x8: times 8 dw 3344*8
+pw_8192: times 8 dw 8192
+pw_m8192: times 8 dw -8192
+pw_5: times 8 dw 5
+pw_201x8: times 8 dw 201*8
+pw_4091x8: times 8 dw 4091*8
+pw_m2751x8: times 8 dw -2751*8
+pw_3035x8: times 8 dw 3035*8
+pw_1751x8: times 8 dw 1751*8
+pw_3703x8: times 8 dw 3703*8
+pw_m1380x8: times 8 dw -1380*8
+pw_3857x8: times 8 dw 3857*8
+pw_995x8: times 8 dw 995*8
+pw_3973x8: times 8 dw 3973*8
+pw_m2106x8: times 8 dw -2106*8
+pw_3513x8: times 8 dw 3513*8
+pw_2440x8: times 8 dw 2440*8
+pw_3290x8: times 8 dw 3290*8
+pw_m601x8: times 8 dw -601*8
+pw_4052x8: times 8 dw 4052*8
+
+pw_4095x8: times 8 dw 4095*8
+pw_101x8: times 8 dw 101*8
+pw_2967x8: times 8 dw 2967*8
+pw_m2824x8: times 8 dw -2824*8
+pw_3745x8: times 8 dw 3745*8
+pw_1660x8: times 8 dw 1660*8
+pw_3822x8: times 8 dw 3822*8
+pw_m1474x8: times 8 dw -1474*8
+pw_3996x8: times 8 dw 3996*8
+pw_897x8: times 8 dw 897*8
+pw_3461x8: times 8 dw 3461*8
+pw_m2191x8: times 8 dw -2191*8
+pw_3349x8: times 8 dw 3349*8
+pw_2359x8: times 8 dw 2359*8
+pw_4036x8: times 8 dw 4036*8
+pw_m700x8: times 8 dw -700*8
+pw_4065x8: times 8 dw 4065*8
+pw_501x8: times 8 dw 501*8
+pw_3229x8: times 8 dw 3229*8
+pw_m2520x8: times 8 dw -2520*8
+pw_3564x8: times 8 dw 3564*8
+pw_2019x8: times 8 dw 2019*8
+pw_3948x8: times 8 dw 3948*8
+pw_m1092x8: times 8 dw -1092*8
+pw_3889x8: times 8 dw 3889*8
+pw_1285x8: times 8 dw 1285*8
+pw_3659x8: times 8 dw 3659*8
+pw_m1842x8: times 8 dw -1842*8
+pw_3102x8: times 8 dw 3102*8
+pw_2675x8: times 8 dw 2675*8
+pw_4085x8: times 8 dw 4085*8
+pw_m301x8: times 8 dw -301*8
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r5-$$+x ; PIC
+%endif
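+; (on x86-32, INV_TXFM_FN loads r5 with $$ via LEA so that o() references
+; remain position-independent)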
+
+%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4]
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rotate 5
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+
+ movd m%3, [%%row_adr1] ;dst0
+ movd m%5, [%%row_adr2] ;dst1
+ punpckldq m%3, m%5 ;high: dst1 ;low: dst0
+ movd m%4, [%%row_adr3] ;dst2
+ movd m%5, [%%row_adr4] ;dst3
+ punpckldq m%4, m%5 ;high: dst3 ;low: dst2
+
+ pxor m%5, m%5
+ punpcklbw m%3, m%5 ;extend byte to word
+ punpcklbw m%4, m%5 ;extend byte to word
+
+ paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0
+ paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2
+
+ packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
+
+ movd [%%row_adr1], m%3 ;store dst0 + out0
+ pshuflw m%4, m%3, q1032
+ movd [%%row_adr2], m%4 ;store dst1 + out1
+ punpckhqdq m%3, m%3
+ movd [%%row_adr3], m%3 ;store dst2 + out2
+ psrlq m%3, 32
+ movd [%%row_adr4], m%3 ;store dst3 + out3
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
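+ ; rounds m0/m1 via pmulhrsw with pw_<rnd> (2048 by default, i.e. a rounded
+ ; >>4) and stores the four 4-pixel rows in the given order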
+%if %5
+ mova m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+
+ WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4
+ ret
+%endmacro
+
+; flags: 1 = swap, 2: coef_regs, 4: no_pack
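+; each dword lane of dst/src holds a word pair (a, b); with the default flags
+; the result is the packed pair of rotations
+;   lo = (a*coef2 + b*coef1 + rnd) >> 12
+;   hi = (a*coef1 - b*coef2 + rnd) >> 12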
+%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
+%if %6 & 2
+ pmaddwd m%2, m%4, m%1
+ pmaddwd m%1, m%5
+%elif %6 & 1
+ pmaddwd m%2, m%1, [o(pw_%5_%4)]
+ pmaddwd m%1, [o(pw_%4_m%5)]
+%else
+ pmaddwd m%2, m%1, [o(pw_%4_m%5)]
+ pmaddwd m%1, [o(pw_%5_%4)]
+%endif
+ paddd m%2, m%3
+ paddd m%1, m%3
+ psrad m%2, 12
+ psrad m%1, 12
+%if %6 & 4 == 0
+ packssdw m%1, m%2
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8
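+ ; 4-point idct on two packed rows: in m0 = in0|in1, m1 = in2|in3
+ ; (low|high halves); out m0 = out0|out1, m1 = out3|out2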
+ mova m3, [o(pd_2048)]
+ punpckhwd m2, m0, m1 ;unpacked in1 in3
+ punpcklwd m0, m1 ;unpacked in0 in2
+ ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
+ ITX_MUL2X_PACK 0, 1, 3, 2896, 2896
+ psubsw m1, m0, m2 ;high: out2 ;low: out3
+ paddsw m0, m2 ;high: out1 ;low: out0
+%endmacro
+
+%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2
+ %define %%p1 m(i%1_%3_internal_8bpc)
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+%if has_epilogue
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jz %%end
+%endif
+ lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
+ call %%p1
+ RET
+%%end:
+%else
+ lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4, 6
+%ifidn %1_%2, dct_dct
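+ ; dc-only shortcut: broadcast the dc coefficient, scale it twice by
+ ; 2896/4096 (pmulhrsw with pw_2896x8), and reuse the iadst_4x4 store tail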
+ pshuflw m0, [coeffq], q0000
+ punpcklqdq m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd ;0
+ pmulhrsw m0, m1
+ mova m1, m0
+ TAIL_CALL m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16.
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0] ;high: in1 ;low: in0
+ mova m1, [coeffq+16*1] ;high: in3 ;low in2
+
+ IDCT4_1D_PACKED
+
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2 ;high: in1 ;low: in0
+ pshufb m1, m3, m2 ;high: in3 ;low :in2
+ jmp tx2q
+
+.pass2:
+ IDCT4_1D_PACKED
+
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
+
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ call .main
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2 ;high: in3 ;low :in2
+ punpcklwd m0, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ call .main
+
+.end:
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+
+.end2:
+ ITX4_END 0, 1, 2, 3
+
+ALIGN function_align
+cglobal_label .main
+ punpcklwd m2, m0, m1 ;unpacked in0 in2
+ punpckhwd m0, m1 ;unpacked in1 in3
+ mova m3, m0
+ pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
+ pmaddwd m0, [o(pw_0_3344)] ;3344 * in3
+ paddd m1, m0 ;t2
+ pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
+ paddd m4, m0 ;t0 + t3
+ pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m0, [o(pd_2048)]
+ paddd m1, m0 ;t2 + 2048
+ paddd m2, m0
+ paddd m0, m4 ;t0 + t3 + 2048
+ paddd m5, m2 ;t1 + t3 + 2048
+ paddd m2, m4
+ paddd m2, m3 ;t0 + t1 - t3 + 2048
+ REPX {psrad x, 12}, m1, m0, m5, m2
+ packssdw m0, m5 ;high: out1 ;low: out0
+ packssdw m1, m2 ;high: out3 ;low: out2
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2 ;high: in3 ;low: in2
+ punpckhwd m1, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+
+.end:
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m3, [o(pw_1697x8)]
+ pmulhrsw m2, m0, m3
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2 ;high: in3 ;low: in2
+ punpcklwd m0, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ mova m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
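+;4-point inverse Walsh-Hadamard transform (lossless mode) on packed
+;registers; add/subtract/shift only, no multiplies.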
+%macro IWHT4_1D_PACKED 0
+ punpckhqdq m3, m0, m1 ;low: in1 high: in3
+ punpcklqdq m0, m1 ;low: in0 high: in2
+ psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3
+ paddw m0, m3 ;low: in0 + in1 high: in2 + in3
+ punpckhqdq m2, m2 ;t2 t2
+ punpcklqdq m0, m0 ;t0 t0
+ psubw m1, m0, m2
+ psraw m1, 1 ;t4 t4
+ psubw m1, m3 ;low: t1/out2 high: t3/out1
+ psubw m0, m1 ;high: out0
+ paddw m2, m1 ;low: out3
+%endmacro
+
+INIT_XMM sse2
+cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+ psraw m0, 2
+ psraw m1, 2
+ IWHT4_1D_PACKED
+ punpckhwd m0, m1
+ punpcklwd m3, m1, m2
+ punpckhdq m1, m0, m3
+ punpckldq m0, m3
+ IWHT4_1D_PACKED
+ shufpd m0, m2, 0x01
+ ITX4_END 0, 3, 2, 1, 0
+
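+;8-point inverse DCT on packed registers: m0..m3 hold in0/in1, in2/in3,
+;in4/in5, in6/in7 in their low/high qwords. Rotations use the (799, 4017),
+;(3406, 2276), (1567, 3784) and (2896, 2896) coefficient pairs with pd_2048
+;rounding; the output packing is given by the trailing comments.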
+%macro IDCT8_1D_PACKED 0
+ mova m6, [o(pd_2048)]
+ punpckhwd m4, m0, m3 ;unpacked in1 in7
+ punpcklwd m0, m2 ;unpacked in0 in4
+ punpckhwd m2, m1 ;unpacked in5 in3
+ punpcklwd m1, m3 ;unpacked in2 in6
+ ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a
+ ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a
+ ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2
+ psubsw m3, m4, m2 ;low: t6a high: t5a
+ paddsw m4, m2 ;low: t7 high: t4
+ pshufb m3, [o(deint_shuf1)]
+ ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1
+ ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5
+ psubsw m2, m0, m1 ;low: tmp3 high: tmp2
+ paddsw m0, m1 ;low: tmp0 high: tmp1
+ punpcklqdq m1, m4, m3 ;low: t7 high: t6
+ punpckhqdq m4, m3 ;low: t4 high: t5
+ psubsw m3, m0, m1 ;low: out7 high: out6
+ paddsw m0, m1 ;low: out0 high: out1
+ paddsw m1, m2, m4 ;low: out3 high: out2
+ psubsw m2, m4 ;low: out4 high: out5
+%endmacro
+
+;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
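+;If coef2 is given as a value below 8, the coefficients are taken from the
+;registers named by the coef arguments (pre-loaded vectors) instead of pw_
+;constants; the optional 8th argument leaves dst2 in tmp1.
+;e.g. ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 gives (with the original values)
+;  m0 = (m0*401 - m7*4076 + 2048) >> 12, m7 = (m0*4076 + m7*401 + 2048) >> 12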
+%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
+ punpckhwd m%4, m%1, m%2
+ punpcklwd m%1, m%2
+%if %7 < 8
+ pmaddwd m%2, m%7, m%1
+ pmaddwd m%3, m%7, m%4
+%else
+ mova m%2, [o(pw_%7_%6)]
+%if %8
+ pmaddwd m%3, m%1, m%2
+ pmaddwd m%2, m%4
+%else
+ pmaddwd m%3, m%4, m%2
+ pmaddwd m%2, m%1
+%endif
+%endif
+ paddd m%3, m%5
+ paddd m%2, m%5
+ psrad m%3, 12
+ psrad m%2, 12
+%if %8
+ packssdw m%3, m%2
+%else
+ packssdw m%2, m%3 ;dst2
+%endif
+%if %7 < 8
+ pmaddwd m%4, m%6
+ pmaddwd m%1, m%6
+%elif %8
+ mova m%2, [o(pw_%6_m%7)]
+ pmaddwd m%4, m%2
+ pmaddwd m%1, m%2
+%else
+ mova m%3, [o(pw_%6_m%7)]
+ pmaddwd m%4, m%3
+ pmaddwd m%1, m%3
+%endif
+ paddd m%4, m%5
+ paddd m%1, m%5
+ psrad m%4, 12
+ psrad m%1, 12
+ packssdw m%1, m%4 ;dst1
+%endmacro
+
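+;4-point inverse DCT on unpacked rows: src1..src4 hold in0..in3 on entry and
+;out0..out3 on return; the last argument is a register pre-loaded with pd_2048.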
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
+ ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
+ psubsw m%3, m%1, m%2 ;out2
+ paddsw m%2, m%1 ;out1
+ paddsw m%1, m%5, m%4 ;out0
+ psubsw m%4, m%5 ;out3
+%endmacro
+
+%macro WRITE_4X8 4 ;row[1-4]
+ WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4
+%endmacro
+
+%macro INV_4X8 0
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m2 ;low: in2 high: in3
+ punpckldq m0, m2 ;low: in0 high: in1
+ punpckldq m2, m3, m4 ;low: in4 high: in5
+ punpckhdq m3, m4 ;low: in6 high: in7
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklqdq m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ pmulhrsw m0, [o(pw_2048)]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ TAIL_CALL m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
+INIT_XMM ssse3
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
+
+cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(idct_8x4_internal_8bpc).main
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+
+.pass2:
+ call .main
+ shufps m1, m1, q1032
+ shufps m3, m3, q1032
+ mova m4, [o(pw_2048)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(iadst_8x4_internal_8bpc).main
+
+.pass1_end:
+ INV_4X8
+ jmp tx2q
+
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call .main
+ mova m4, [o(pw_2048)]
+ pxor m5, m5
+ psubw m5, m4
+
+.end:
+ punpcklqdq m4, m5
+
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ pxor m5, m5
+ mova [coeffq+16*0], m5
+ mova [coeffq+16*1], m5
+ mova [coeffq+16*2], m5
+ mova [coeffq+16*3], m5
+
+.end3:
+ WRITE_4X8 0, 1, 2, 3
+ RET
+
+ALIGN function_align
+cglobal_label .main
+ mova m6, [o(pd_2048)]
+ punpckhwd m4, m3, m0 ;unpacked in7 in0
+ punpckhwd m5, m2, m1 ;unpacked in5 in2
+ punpcklwd m1, m2 ;unpacked in3 in4
+ punpcklwd m0, m3 ;unpacked in1 in6
+ ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
+ ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
+ ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
+ ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
+
+ psubsw m3, m4, m1 ;low: t4 high: t5
+ paddsw m4, m1 ;low: t0 high: t1
+ psubsw m2, m5, m0 ;low: t6 high: t7
+ paddsw m5, m0 ;low: t2 high: t3
+
+ shufps m1, m3, m2, q1032
+ punpckhwd m2, m1
+ punpcklwd m3, m1
+ ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
+ ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
+
+ psubsw m1, m4, m5 ;low: t2 high: t3
+ paddsw m4, m5 ;low: out0 high: -out7
+ psubsw m5, m3, m2 ;low: t7 high: t6
+ paddsw m3, m2 ;low: out6 high: -out1
+ shufps m0, m4, m3, q3210 ;low: out0 high: -out1
+ shufps m3, m4, q3210 ;low: out6 high: -out7
+
+ mova m2, [o(pw_2896_m2896)]
+ mova m7, [o(pw_2896_2896)]
+ shufps m4, m1, m5, q1032 ;low: t3 high: t7
+ shufps m1, m5, q3210 ;low: t2 high: t6
+ punpcklwd m5, m1, m4
+ punpckhwd m1, m4
+ pmaddwd m4, m2, m1 ;-out5
+ pmaddwd m2, m5 ; out4
+ pmaddwd m1, m7 ; out2
+ pmaddwd m5, m7 ;-out3
+ REPX {paddd x, m6}, m4, m2, m1, m5
+ REPX {psrad x, 12}, m4, m2, m1, m5
+ packssdw m1, m5 ;low: out2 high: -out3
+ packssdw m2, m4 ;low: out4 high: -out5
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(iadst_8x4_internal_8bpc).main
+
+ punpcklwd m4, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m5, m1, m0
+ punpckhwd m1, m0
+ punpckldq m2, m3, m1 ;low: in4 high: in5
+ punpckhdq m3, m1 ;low: in6 high: in7
+ punpckldq m0, m4, m5 ;low: in0 high: in1
+ punpckhdq m1, m4, m5 ;low: in2 high: in3
+ jmp tx2q
+
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+
+ mova m4, m0
+ mova m5, m1
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ pshufd m2, m5, q1032
+ pshufd m3, m4, q1032
+ mova m5, [o(pw_2048)]
+ pxor m4, m4
+ psubw m4, m5
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+
+.pass2:
+ mova m4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+
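+;Add two rows of word residuals to the 8-pixel rows at dstq and dstq+strideq:
+;the destination bytes are zero-extended to words, the residuals added and
+;the results clamped back to bytes with packuswb.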
+%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3]
+ movq m%3, [dstq ]
+ movq m%4, [dstq+strideq]
+ pxor m%5, m%5
+ punpcklbw m%3, m%5 ;extend byte to word
+ punpcklbw m%4, m%5 ;extend byte to word
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ movq [dstq ], m%3
+ punpckhqdq m%3, m%3
+ movq [dstq+strideq], m%3
+%endmacro
+
+%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3]
+ WRITE_8X2 %1, %2, %5, %6, %7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X2 %3, %4, %5, %6, %7
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklqdq m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ mova m2, [o(pw_2048)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ TAIL_CALL m(iadst_8x4_internal_8bpc).end2
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ call m(idct_4x8_internal_8bpc).main
+
+ mova m4, [o(deint_shuf1)]
+ mova m5, [o(deint_shuf2)]
+ pshufb m0, m4
+ pshufb m1, m5
+ pshufb m2, m4
+ pshufb m3, m5
+ punpckhdq m4, m0, m1
+ punpckldq m0, m1
+ punpckhdq m5, m2, m3
+ punpckldq m2, m3
+ punpckhqdq m1, m0, m2 ;in1
+ punpcklqdq m0, m2 ;in0
+ punpckhqdq m3, m4, m5 ;in3
+ punpcklqdq m2, m4, m5 ;in2
+ jmp tx2q
+
+.pass2:
+ call .main
+ jmp m(iadst_8x4_internal_8bpc).end
+
+ALIGN function_align
+cglobal_label .main
+ mova m6, [o(pd_2048)]
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ pxor m5, m5
+ psubsw m3, m5, m1
+ psubsw m5, m4
+ punpckhdq m4, m5, m3
+ punpckldq m5, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhwd m1, m0, m5 ;in1
+ punpcklwd m0, m5 ;in0
+ punpcklwd m2, m3, m4 ;in2
+ punpckhwd m3, m4 ;in3
+ jmp tx2q
+
+.pass2:
+ call .main
+
+.end:
+ mova m4, [o(pw_2048)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+
+.end2:
+ pxor m6, m6
+ mova [coeffq+16*0], m6
+ mova [coeffq+16*1], m6
+ mova [coeffq+16*2], m6
+ mova [coeffq+16*3], m6
+.end3:
+ WRITE_8X4 0, 1, 2, 3, 4, 5, 6
+ RET
+
+ALIGN function_align
+cglobal_label .main
+ punpckhwd m6, m0, m2 ;unpacked in0 in2
+ punpcklwd m0, m2 ;unpacked in0 in2
+ punpckhwd m7, m1, m3 ;unpacked in1 in3
+ punpcklwd m1, m3 ;unpacked in1 in3
+
+ mova m2, [o(pw_3344_m3344)]
+ mova m4, [o(pw_0_3344)]
+ pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2
+ pmaddwd m5, m4, m7 ;3344 * in3
+ pmaddwd m2, m0
+ pmaddwd m4, m1
+ paddd m3, m5
+ paddd m2, m4
+ mova m4, [o(pd_2048)]
+ paddd m3, m4 ;t2 + 2048
+ paddd m2, m4
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+
+ pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m3, m4 ;t0 + t3
+
+ pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m4, [o(pd_2048)]
+ paddd m0, m4
+ paddd m4, m3 ;t0 + t3 + 2048
+ paddd m5, m0 ;t1 + t3 + 2048
+ paddd m3, m0
+ paddd m3, m1 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m3, 12 ;out3
+ packssdw m0, m4, m5 ;low: out0 high: out1
+
+ pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m1, m4 ;t0 + t3
+ pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+
+ mova m4, [o(pd_2048)]
+ paddd m6, m4
+ paddd m4, m1 ;t0 + t3 + 2048
+ paddd m5, m6 ;t1 + t3 + 2048
+ paddd m1, m6
+ paddd m1, m7 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m1, 12 ;out3
+ packssdw m3, m1 ;out3
+ packssdw m4, m5 ;low: out0 high: out1
+
+ punpckhqdq m1, m0, m4 ;out1
+ punpcklqdq m0, m4 ;out0
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+
+ punpckhwd m5, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+
+ pxor m0, m0
+ psubsw m4, m0, m2
+ psubsw m0, m5
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckhdq m4, m3, m1
+ punpckldq m3, m1
+ punpckhwd m1, m0, m3 ;in1
+ punpcklwd m0, m3 ;in0
+ punpckhwd m3, m2, m4 ;in3
+ punpcklwd m2, m4 ;in2
+ jmp tx2q
+
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m4, m0
+ mova m5, m1
+ mova m0, m3
+ mova m1, m2
+ mova m2, m5
+ mova m3, m4
+ jmp m(iadst_8x4_internal_8bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+ paddsw m0, m0
+ paddsw m1, m1
+ paddsw m2, m2
+ paddsw m3, m3
+
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m5, m4, m1
+ punpckldq m4, m1
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhwd m1, m0, m4 ;in1
+ punpcklwd m0, m4 ;in0
+ punpcklwd m2, m3, m5 ;in2
+ punpckhwd m3, m5 ;in3
+ jmp tx2q
+
+.pass2:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8, 8, 16*4
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m2
+ psrlw m2, 3
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+.end:
+ mov r3d, 2
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)]
+.loop:
+ WRITE_8X4 0, 0, 0, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .loop
+ jmp tx2q
+.end3:
+ RET
+%endif
+%endmacro
+
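+;Load 8 rows of coefficients; with is_rect2 set, each row is pre-scaled by
+;pw_2896x8 (i.e. *2896/4096), the extra scaling applied to rectangular
+;(2:1 aspect) transform sizes.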
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [%1+%2*0]
+ pmulhrsw m1, m7, [%1+%2*1]
+ pmulhrsw m2, m7, [%1+%2*2]
+ pmulhrsw m3, m7, [%1+%2*3]
+ pmulhrsw m4, m7, [%1+%2*4]
+ pmulhrsw m5, m7, [%1+%2*5]
+ pmulhrsw m6, m7, [%1+%2*6]
+ pmulhrsw m7, [%1+%2*7]
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+ mova m7, [%1+%2*7]
+%endif
+%endmacro
+
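+;Odd half (t4..t7) of an 8-point inverse DCT on unpacked rows; as used here,
+;src1..src4 carry in1, in3, in5, in7 and the even half comes from IDCT4_1D.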
+%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
+ ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
+ psubsw m%2, m%4, m%5 ;t6a
+ paddsw m%4, m%5 ;t7
+ psubsw m%5, m%1, m%3 ;t5a
+ paddsw m%1, m%3 ;t4
+ ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call .main
+
+.pass1_end:
+ mova m7, [o(pw_16384)]
+
+.pass1_end1:
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+
+.pass1_end2:
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
+
+cglobal_label .pass1_end3
+ punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
+ punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
+ punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
+ punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
+ punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
+ punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
+ punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
+ punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
+ punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
+ punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
+ mova [rsp+gprsize+16*2], m6
+ mova m6, [rsp+gprsize+16*1]
+ punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
+ punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
+ punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
+ punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
+ punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
+ punpcklwd m0, m2 ;00 20 40 60 01 21 41 61
+
+ punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
+ punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
+ mova [rsp+gprsize+16*0], m2
+ punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
+ punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
+ punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
+ punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
+ mova m7, [rsp+gprsize+16*2]
+ punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
+ punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
+ mova m7, [rsp+gprsize+16*0]
+ jmp tx2q
+
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call .main
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+
+.end2:
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*0], m7
+
+.end3:
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
+ jmp tx2q
+
+.end4:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m1
+ mova m7, [o(pd_2048)]
+ IDCT4_1D 0, 2, 4, 6, 1, 3, 7
+ mova m3, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m2
+ mova m2, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*1], m4
+ mova m4, [rsp+gprsize*2+16*0]
+ mova [rsp+gprsize*2+16*0], m6
+ IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
+ mova m6, [rsp+gprsize*2+16*0]
+ psubsw m7, m0, m4 ;out7
+ paddsw m0, m4 ;out0
+ mova [rsp+gprsize*2+16*0], m7
+ mova m1, [rsp+gprsize*2+16*2]
+ psubsw m4, m6, m3 ;out4
+ paddsw m3, m6 ;out3
+ mova m7, [rsp+gprsize*2+16*1]
+ psubsw m6, m1, m5 ;out6
+ paddsw m1, m5 ;out1
+ psubsw m5, m7, m2 ;out5
+ paddsw m2, m7 ;out2
+ ret
+
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call .main
+ call .main_pass1_end
+
+.pass1_end:
+ mova m7, [o(pw_16384)]
+
+.pass1_end1:
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end2
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call .main
+ call .main_pass2_end
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
+ jmp m(idct_8x8_internal_8bpc).end2
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m4
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
+ ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
+ paddsw m3, m2, m6 ;t2
+ psubsw m2, m6 ;t6
+ paddsw m4, m5, m1 ;t3
+ psubsw m5, m1 ;t7
+ ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a
+
+ mova m6, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m5
+ mova m1, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*1], m2
+ mova m5, [rsp+gprsize*2+16*0]
+ mova [rsp+gprsize*2+16*0], m3
+ ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
+ ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
+ psubsw m2, m0, m6 ;t4
+ paddsw m0, m6 ;t0
+ paddsw m3, m5, m1 ;t1
+ psubsw m5, m1 ;t5
+ ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a
+
+ mova m7, [rsp+gprsize*2+16*0]
+ paddsw m1, m3, m4 ;-out7
+ psubsw m3, m4 ;t3
+ mova [rsp+gprsize*2+16*0], m1
+ psubsw m4, m0, m7 ;t2
+ paddsw m0, m7 ;out0
+ mova m6, [rsp+gprsize*2+16*2]
+ mova m7, [rsp+gprsize*2+16*1]
+ paddsw m1, m5, m6 ;-out1
+ psubsw m5, m6 ;t6
+ paddsw m6, m2, m7 ;out6
+ psubsw m2, m7 ;t7
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova [rsp+gprsize*2+16*1], m1
+ mova [rsp+gprsize*2+16*2], m6
+ punpckhwd m1, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m7, m5, m2
+ punpcklwd m5, m2
+ mova m2, [o(pw_2896_2896)]
+ mova m6, [o(pd_2048)]
+ pmaddwd m3, m2, m7
+ pmaddwd m2, m5
+ paddd m3, m6
+ paddd m2, m6
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+ mova m3, [o(pw_2896_m2896)]
+ pmaddwd m7, m3
+ pmaddwd m5, m3
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m5, m7 ;-out5
+ mova m3, [o(pw_2896_2896)]
+ pmaddwd m7, m3, m1
+ pmaddwd m3, m4
+ paddd m7, m6
+ paddd m3, m6
+ psrad m7, 12
+ psrad m3, 12
+ packssdw m3, m7 ;-out3
+ mova m7, [o(pw_2896_m2896)]
+ pmaddwd m1, m7
+ pmaddwd m4, m7
+ paddd m1, m6
+ paddd m4, m6
+ psrad m1, 12
+ psrad m4, 12
+ packssdw m4, m1 ;out4
+ mova m1, [rsp+gprsize*2+16*1]
+ mova m6, [rsp+gprsize*2+16*2]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ paddsw m7, m4, m3 ;t2 + t3
+ psubsw m4, m3 ;t2 - t3
+ paddsw m3, m5, m2 ;t6 + t7
+ psubsw m5, m2 ;t6 - t7
+ mova m2, [o(pw_2896x8)]
+ pmulhrsw m4, m2 ;out4
+ pmulhrsw m5, m2 ;-out5
+ pmulhrsw m7, m2 ;-out3
+ pmulhrsw m2, m3 ;out2
+ mova m3, m7
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call m(iadst_8x8_internal_8bpc).main
+ call m(iadst_8x8_internal_8bpc).main_pass1_end
+
+.pass1_end:
+ mova m7, [o(pw_m16384)]
+
+.pass1_end1:
+ pmulhrsw m1, m7
+ mova [rsp+gprsize+16*1], m1
+ mova m1, m6
+ mova m6, m2
+ pmulhrsw m2, m5, m7
+ mova m5, m6
+ mova m6, m4
+ pmulhrsw m4, m3, m7
+ mova m3, m6
+ mova m6, m0
+ mova m0, m7
+ pxor m7, m7
+ psubw m7, m0
+ pmulhrsw m0, [rsp+gprsize+16*0]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call m(iadst_8x8_internal_8bpc).main
+ call m(iadst_8x8_internal_8bpc).main_pass2_end
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*2], m2
+ mova m2, m0
+ pxor m0, m0
+ psubw m0, m7
+ mova m7, m2
+ pmulhrsw m1, m0
+ pmulhrsw m2, m5, m0
+ mova [rsp+gprsize+16*1], m1
+ mova m5, m4
+ mova m1, m6
+ pmulhrsw m4, m3, m0
+ pmulhrsw m0, [rsp+gprsize+16*0]
+ mova m3, m5
+ mova [rsp+gprsize+16*0], m7
+ jmp m(idct_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.end:
+ pmulhrsw m7, [o(pw_4096)]
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_4096)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).end3
+
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd
+ pmulhrsw m0, [o(pw_16384)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, [o(pw_2048)]
+.end:
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ RET
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
+
+cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(idct_4x8_internal_8bpc).pass1)]
+
+.pass1:
+ mova m0, [coeffq+16*1]
+ mova m1, [coeffq+16*3]
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*7]
+ push tx2q
+ lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)]
+ jmp r3
+
+.pass1_2:
+ mova [coeffq+16*1], m0
+ mova [coeffq+16*3], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*2]
+ mova m2, [coeffq+16*4]
+ mova m3, [coeffq+16*6]
+ lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)]
+ jmp r3
+
+.pass1_end:
+ pop tx2q
+
+ mova m4, [coeffq+16*1]
+ mova m5, [coeffq+16*3]
+ mova m6, [coeffq+16*5]
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*7], m7
+ jmp tx2q
+
+.pass2:
+ call m(idct_16x4_internal_8bpc).main
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*4], m4
+
+.end1:
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ mov r3, coeffq
+ WRITE_4X8 0, 1, 3, 2
+
+ mova m0, [r3+16*4]
+ mova m1, [r3+16*5]
+ mova m2, [r3+16*6]
+ mova m3, m7
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X8 0, 1, 3, 2
+
+.end2:
+ pxor m7, m7
+ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)]
+ jmp m(idct_4x16_internal_8bpc).pass1
+
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass2_end
+
+ punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
+ punpckhqdq m4, m5 ;low: out8 high: out10
+ punpcklqdq m5, m7, m2 ;low: out4 high: out6
+ punpckhqdq m2, m7 ;low: -out9 high: -out11
+ mova [coeffq+16*4], m2
+ mova [coeffq+16*5], m6
+ mova m2, [coeffq+16*6]
+ mova m6, [coeffq+16*7]
+ punpckhqdq m1, m6, m0 ;low: -out13 high: -out15
+ punpcklqdq m0, m6 ;low: out0 high: out2
+ punpckhqdq m6, m3, m2 ;low: out12 high: out14
+ punpcklqdq m2, m3 ;low: -out1 high: -out3
+
+ mova m7, [o(pw_2048)]
+
+.end1:
+ REPX {pmulhrsw x, m7}, m0, m5, m4, m6
+ pxor m3, m3
+ psubw m3, m7
+ mova m7, [coeffq+16*4]
+ REPX {pmulhrsw x, m3}, m2, m7, m1
+ pmulhrsw m3, [coeffq+16*5]
+ mova [coeffq+16*7], m5
+
+ punpckhqdq m5, m4, m7 ;low: out10 high: out11
+ punpcklqdq m4, m7 ;low: out8 high: out9
+ punpckhqdq m7, m6, m1 ;low: out14 high: out15
+ punpcklqdq m6, m1 ;low: out12 high: out13
+ punpckhqdq m1, m0, m2 ;low: out2 high: out3
+ punpcklqdq m0, m2 ;low: out0 high: out1
+ mova [coeffq+16*4], m4
+ mova m4, [coeffq+16*7]
+ punpcklqdq m2, m4, m3 ;low: out4 high: out5
+ punpckhqdq m4, m3 ;low: out6 high: out7
+ mova m3, m4
+
+.end2:
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ mov r3, coeffq
+ WRITE_4X8 0, 1, 2, 3
+
+ mova m0, [r3+16*4]
+ mova m1, [r3+16*5]
+ mova m2, [r3+16*6]
+ mova m3, m7
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X8 0, 1, 2, 3
+
+.end3:
+ pxor m7, m7
+ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)]
+ jmp m(idct_4x16_internal_8bpc).pass1
+
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass2_end
+
+ punpckhqdq m6, m5, m4 ;low: out5 high: out7
+ punpcklqdq m4, m5 ;low: -out8 high: -out10
+ punpckhqdq m5, m7, m2 ;low: -out4 high: -out6
+ punpcklqdq m2, m7 ;low: out9 high: out11
+ mova [coeffq+16*4], m2
+ mova [coeffq+16*5], m6
+ mova m2, [coeffq+16*6]
+ mova m6, [coeffq+16*7]
+ punpcklqdq m1, m6, m0 ;low: out13 high: out15
+ punpckhqdq m0, m6 ;low: -out0 high: -out2
+ punpcklqdq m6, m3, m2 ;low: -out12 high: -out14
+ punpckhqdq m2, m3 ;low: out1 high: out3
+
+ mova m7, [o(pw_m2048)]
+ jmp m(iadst_4x16_internal_8bpc).end1
+
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
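+;identity16 scaling: out = 2*in + in*1697/2048 (~2*sqrt(2)*in). With the
+;optional pw_16384 argument, the 1697 term is halved and the doubling is
+;skipped, giving ~sqrt(2)*in, i.e. the same result with a >>1 folded in.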
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+ pmulhrsw m%2, m%4
+%else
+ paddsw m%1, m%1
+%endif
+ paddsw m%1, m%2
+%endmacro
+
+cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*1]
+ mova m6, [o(pw_1697x8)]
+ mova m1, [coeffq+16*3]
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*7]
+ pcmpeqw m7, m7
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_2)]
+.pass1:
+ pmulhrsw m4, m6, m0
+ pmulhrsw m5, m6, m1
+ pavgw m4, m0
+ pcmpeqw m0, m7
+ pavgw m5, m1
+ pcmpeqw m1, m7
+ pandn m0, m4
+ pmulhrsw m4, m6, m2
+ pandn m1, m5
+ pmulhrsw m5, m6, m3
+ pavgw m4, m2
+ pcmpeqw m2, m7
+ pavgw m5, m3
+ pcmpeqw m3, m7
+ pandn m2, m4
+ pandn m3, m5
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+.pass1_2:
+ mova [coeffq+16*1], m0
+ mova [coeffq+16*3], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*2]
+ mova m2, [coeffq+16*4]
+ mova m3, [coeffq+16*6]
+ lea tx2q, [o(.pass1_end)]
+ jmp .pass1
+.pass1_end:
+ mova m4, [coeffq+16*1]
+ mova m5, [coeffq+16*3]
+ mova m6, [coeffq+16*5]
+ jmp r3
+.pass2:
+ mova m7, [o(pw_1697x16)]
+ mova [coeffq+16*6], m6
+ REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+ mova m6, [coeffq+16*7]
+ IDTX16 6, 7, 7
+ mova [coeffq+16*7], m6
+ mova m6, [coeffq+16*6]
+ pmulhrsw m7, m6, [o(pw_1697x16)]
+ paddsw m6, m6
+ paddsw m6, m7
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*4], m4
+ jmp m(iadst_4x16_internal_8bpc).end2
+
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4, 8
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ mov r2d, 2
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)]
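+;dc-only path, also used by the 16x8 dct_dct entry point
+;(r2d holds the number of 2-row iterations)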
+.dconly:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ pxor m5, m5
+.dconly_loop:
+ mova m1, [dstq]
+ mova m3, [dstq+strideq]
+ punpckhbw m2, m1, m5
+ punpcklbw m1, m5
+ punpckhbw m4, m3, m5
+ punpcklbw m3, m5
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ mova [dstq], m1
+ mova [dstq+strideq], m3
+ lea dstq, [dstq+strideq*2]
+ dec r2d
+ jg .dconly_loop
+ jmp tx2q
+.end:
+ RET
+%endif
+%endmacro
+
+%macro LOAD_7ROWS 2 ;src, stride
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+%endmacro
+
+%macro SAVE_7ROWS 2 ;src, stride
+ mova [%1+%2*0], m0
+ mova [%1+%2*1], m1
+ mova [%1+%2*2], m2
+ mova [%1+%2*3], m3
+ mova [%1+%2*4], m4
+ mova [%1+%2*5], m5
+ mova [%1+%2*6], m6
+%endmacro
+
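+;Odd half (t8a..t15a) of a 16-point inverse DCT on packed registers;
+;src1..src4 carry the odd input rows and the results stay packed for the
+;caller to combine with the 8-point even half.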
+%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
+ punpckhwd m%5, m%4, m%1 ;packed in13 in3
+ punpcklwd m%1, m%4 ;packed in1 in15
+ punpcklwd m%4, m%3, m%2 ;packed in9 in7
+ punpckhwd m%2, m%3 ;packed in5 in11
+ mova m%7, [o(pd_2048)]
+ ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a
+ ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a
+ ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a
+ ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a
+ psubsw m%6, m%1, m%4 ;low: t9 high: t14
+ paddsw m%1, m%4 ;low: t8 high: t15
+ psubsw m%4, m%5, m%2 ;low: t10 high: t13
+ paddsw m%5, m%2 ;low: t11 high: t12
+ mova m%2, [o(deint_shuf2)]
+ pshufb m%6, m%2
+ pshufb m%4, m%2
+ ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
+ ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a
+ psubsw m%3, m%1, m%5 ;low: t11a high: t12a
+ paddsw m%1, m%5 ;low: t8a high: t15a
+ psubsw m%5, m%6, m%4 ;low: t10 high: t13
+ paddsw m%6, m%4 ;low: t9 high: t14
+ pshufb m%3, m%2
+ pshufb m%5, m%2
+ ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11
+ ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a
+ packssdw m%2, m%4 ;low: t11 high: t10a
+ packssdw m%3, m%5 ;low: t12 high: t13a
+ punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14
+ punpcklqdq m%1, m%6 ;low: t8a high: t9
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_7ROWS coeffq, 16
+ call .main
+
+.pass1_end:
+ punpckhwd m7, m0, m2 ;packed out1, out5
+ punpcklwd m0, m2 ;packed out0, out4
+ punpcklwd m2, m1, m3 ;packed out3, out7
+ punpckhwd m1, m3 ;packed out2, out6
+ mova [coeffq+16*6], m7
+ mova m7, [coeffq+16*7]
+ punpckhwd m3, m4, m6 ;packed out9, out13
+ punpcklwd m4, m6 ;packed out8, out12
+ punpcklwd m6, m5, m7 ;packed out11, out15
+ punpckhwd m5, m7 ;packed out10, out14
+
+.pass1_end2:
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*6]
+ mova [coeffq+16*6], m7
+
+.pass1_end3:
+ punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high
+ punpcklwd m3, m6 ;packed 9, 11, 13, 15 low
+ punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high
+ punpcklwd m4, m5 ;packed 8, 10, 12, 14 low
+ punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1)
+ punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0)
+ punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3)
+ punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2)
+ mova [coeffq+16*7], m3
+ mova m3, [coeffq+16*6]
+ punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high
+ punpcklwd m3, m2 ;packed 1, 3, 5, 7 low
+ punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high
+ punpcklwd m0, m1 ;packed 0, 2, 4, 6 low
+ punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1)
+ punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0)
+ punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3)
+ punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2)
+ jmp tx2q
+
+.pass2:
+ lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)]
+
+.pass2_end:
+ mova [coeffq+16*4], m4
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ lea r3, [dstq+8]
+ call tx2q
+
+ add coeffq, 16*4
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mov dstq, r3
+ jmp tx2q
+
+ALIGN function_align
+cglobal_label .main
+ punpckhqdq m7, m0, m1 ;low:in1 high:in3
+ punpcklqdq m0, m1
+ punpcklqdq m1, m2, m3
+ punpckhqdq m3, m2 ;low:in7 high:in5
+ mova [coeffq+16*4], m7
+ mova [coeffq+16*5], m3
+ mova m7, [coeffq+16*7]
+ punpcklqdq m2, m4, m5
+ punpckhqdq m4, m5 ;low:in9 high:in11
+ punpcklqdq m3, m6, m7
+ punpckhqdq m7, m6 ;low:in15 high:in13
+ mova [coeffq+16*6], m4
+ IDCT8_1D_PACKED
+ mova m6, [coeffq+16*4]
+ mova m4, [coeffq+16*5]
+ mova m5, [coeffq+16*6]
+ mova [coeffq+16*4], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*6], m3
+
+ IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
+
+ mova m1, [coeffq+16*4]
+ psubsw m3, m0, m7 ;low:out15 high:out14
+ paddsw m0, m7 ;low:out0 high:out1
+ psubsw m7, m1, m5 ;low:out12 high:out13
+ paddsw m1, m5 ;low:out3 high:out2
+ mova [coeffq+16*7], m3
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*6]
+ psubsw m5, m2, m4 ;low:out11 high:out10
+ paddsw m2, m4 ;low:out4 high:out5
+ psubsw m4, m3, m6 ;low:out8 high:out9
+ paddsw m3, m6 ;low:out7 high:out6
+ mova m6, m7
+ ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_7ROWS coeffq, 16
+ call .main
+ call .main_pass1_end
+
+ punpckhwd m6, m7, m0 ;packed -out11, -out15
+ punpcklwd m0, m7 ;packed out0, out4
+ punpcklwd m7, m3, m4 ;packed -out3, -out7
+ punpckhwd m4, m3 ;packed out8, out12
+ mova m1, [coeffq+16*6]
+ punpcklwd m3, m1, m5 ;packed -out1, -out5
+ punpckhwd m5, m1 ;packed out10, out14
+ mova m1, [coeffq+16*7]
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*7], m7
+ punpckhwd m3, m2, m1 ;packed -out9, -out13
+ punpcklwd m1, m2 ;packed out2, out6
+
+ mova m7, [o(pw_16384)]
+
+.pass1_end:
+ REPX {pmulhrsw x, m7}, m0, m1, m4, m5
+ pxor m2, m2
+ psubw m2, m7
+ mova m7, [coeffq+16*6]
+ REPX {pmulhrsw x, m2}, m7, m3, m6
+ pmulhrsw m2, [coeffq+16*7]
+ mova [coeffq+16*6], m7
+ jmp m(idct_16x4_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+ALIGN function_align
+cglobal_label .main
+ mova [coeffq+16*6], m0
+ pshufd m0, m1, q1032
+ pshufd m2, m2, q1032
+ punpckhwd m1, m6, m0 ;packed in13, in2
+ punpcklwd m0, m6 ;packed in3, in12
+ punpckhwd m7, m5, m2 ;packed in11, in4
+ punpcklwd m2, m5 ;packed in5, in10
+ mova m6, [o(pd_2048)]
+ ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3
+ ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5
+ ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11
+ ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13
+ psubsw m5, m1, m2 ;low:t10a high:t11a
+ paddsw m1, m2 ;low:t2a high:t3a
+ psubsw m2, m7, m0 ;low:t12a high:t13a
+ paddsw m7, m0 ;low:t4a high:t5a
+ punpcklqdq m0, m5
+ punpckhwd m0, m5 ;packed t10a, t11a
+ punpcklqdq m5, m2
+ punpckhwd m2, m5 ;packed t13a, t12a
+ ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11
+ ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13
+ mova [coeffq+16*4], m1
+ mova [coeffq+16*5], m7
+ mova m1, [coeffq+16*6]
+ mova m7, [coeffq+16*7]
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ punpckhwd m5, m7, m1 ;packed in15, in0
+ punpcklwd m1, m7 ;packed in1, in14
+ punpckhwd m7, m4, m3 ;packed in9, in6
+ punpcklwd m3, m4 ;packed in7, in8
+ ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1
+ ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7
+ ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9
+ ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15
+ psubsw m4, m5, m3 ;low:t8a high:t9a
+ paddsw m5, m3 ;low:t0a high:t1a
+ psubsw m3, m7, m1 ;low:t14a high:t15a
+ paddsw m7, m1 ;low:t6a high:t7a
+ punpcklqdq m1, m4
+ punpckhwd m1, m4 ;packed t8a, t9a
+ punpcklqdq m4, m3
+ punpckhwd m3, m4 ;packed t15a, t14a
+ ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9
+ ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15
+ paddsw m4, m1, m2 ;low:t12a high:t13a
+ psubsw m1, m2 ;low:t8a high:t9a
+ psubsw m2, m0, m3 ;low:t14a high:t15a
+ paddsw m0, m3 ;low:t10a high:t11a
+ punpcklqdq m3, m1
+ punpckhwd m3, m1 ;packed t12a, t13a
+ punpcklqdq m1, m2
+ punpckhwd m2, m1 ;packed t15a, t14a
+ ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13
+ ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15
+ psubsw m1, m3, m2 ;low:t14a high:t15a
+ paddsw m3, m2 ;low:out2 high:-out13
+ psubsw m2, m4, m0 ;low:t10 high:t11
+ paddsw m0, m4 ;low:-out1 high:out14
+ mova [coeffq+16*6], m0
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*4]
+ mova m3, [coeffq+16*5]
+ psubsw m4, m5, m3 ;low:t4 high:t5
+ paddsw m5, m3 ;low:t0 high:t1
+ psubsw m3, m0, m7 ;low:t6 high:t7
+ paddsw m0, m7 ;low:t2 high:t3
+ punpcklqdq m7, m4
+ punpckhwd m7, m4 ;packed t4, t5
+ punpcklqdq m4, m3
+ punpckhwd m3, m4 ;packed t7, t6
+ ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a
+ ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a
+ psubsw m4, m5, m0 ;low:t2a high:t3a
+ paddsw m0, m5 ;low:out0 high:-out15
+ psubsw m5, m7, m3 ;low:t6 high:t7
+ paddsw m3, m7 ;low:-out3 high:out12
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova m7, [o(deint_shuf1)]
+ mova [coeffq+16*4], m0
+ mova [coeffq+16*5], m3
+ mova m0, [o(pw_2896_m2896)]
+ mova m3, [o(pw_2896_2896)]
+ pshufb m1, m7 ;t14a t15a
+ pshufb m2, m7 ;t10 t11
+ pshufb m4, m7 ;t2a t3a
+ pshufb m5, m7 ;t6 t7
+ pmaddwd m7, m0, m2
+ pmaddwd m2, m3
+ paddd m7, m6
+ paddd m2, m6
+ psrad m7, 12
+ psrad m2, 12
+ packssdw m2, m7 ;low:out6 high:-out9
+ pmaddwd m7, m0, m4
+ pmaddwd m4, m3
+ paddd m7, m6
+ paddd m4, m6
+ psrad m7, 12
+ psrad m4, 12
+ packssdw m4, m7 ;low:-out7 high:out8
+ pmaddwd m7, m3, m5
+ pmaddwd m5, m0
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m7, m5 ;low:out4 high:-out11
+ pmaddwd m5, m3, m1
+ pmaddwd m1, m0
+ paddd m5, m6
+ paddd m1, m6
+ psrad m5, 12
+ psrad m1, 12
+ packssdw m5, m1 ;low:-out5 high:out10
+ mova m0, [coeffq+16*4]
+ mova m3, [coeffq+16*5]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ mova m7, [o(pw_2896x8)]
+ punpckhqdq m6, m2, m1 ;low:t11 high:t15a
+ punpcklqdq m2, m1 ;low:t10 high:t14a
+ psubsw m1, m2, m6
+ paddsw m2, m6
+ punpckhqdq m6, m4, m5 ;low:t3a high:t7
+ punpcklqdq m4, m5 ;low:t2a high:t6
+ psubsw m5, m4, m6
+ paddsw m4, m6
+ pmulhrsw m1, m7 ;low:-out9 high:out10
+ pmulhrsw m2, m7 ;low:out6 high:-out5
+ pmulhrsw m5, m7 ;low:out8 high:-out11
+ pmulhrsw m4, m7 ;low:-out7 high:out4
+ punpckhqdq m7, m4, m5 ;low:out4 high:-out11
+ punpcklqdq m4, m5 ;low:-out7 high:out8
+ punpckhqdq m5, m2, m1 ;low:-out5 high:out10
+ punpcklqdq m2, m1 ;low:out6 high:-out9
+ ret
+
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_7ROWS coeffq, 16
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass1_end
+
+ punpcklwd m6, m7, m0 ;packed out11, out15
+ punpckhwd m0, m7 ;packed -out0, -out4
+ punpckhwd m7, m3, m4 ;packed out3, out7
+ punpcklwd m4, m3 ;packed -out8, -out12
+ mova m1, [coeffq+16*6]
+ punpckhwd m3, m1, m5 ;packed out1, out5
+ punpcklwd m5, m1 ;packed -out10, -out14
+ mova m1, [coeffq+16*7]
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*7], m7
+ punpcklwd m3, m2, m1 ;packed out9, out13
+ punpckhwd m1, m2 ;packed -out2, -out6
+
+ mova m7, [o(pw_m16384)]
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m1, [coeffq+16*6]
+ mova m0, [coeffq+16*5]
+ mova m2, [coeffq+16*7]
+ mova m6, [o(pw_1697x16)]
+ mova m7, [o(pw_16384)]
+ pmulhrsw m4, m6, m1
+ pmulhrsw m3, m6, m0
+ pmulhrsw m5, m6, m2
+ pmulhrsw m4, m7
+ pmulhrsw m3, m7
+ pmulhrsw m5, m7
+ paddsw m1, m4
+ paddsw m0, m3
+ paddsw m5, m2
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mova m4, [coeffq+16*4]
+ mova [coeffq+16*6], m1
+ mova [coeffq+16*5], m0
+ mova [coeffq+16*7], m5
+ pmulhrsw m0, m6, m2
+ pmulhrsw m1, m6, m3
+ pmulhrsw m5, m6, m4
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ pmulhrsw m5, m7
+ paddsw m2, m0
+ paddsw m3, m1
+ paddsw m4, m5
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ pmulhrsw m5, m6, m0
+ pmulhrsw m6, m1
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ paddsw m0, m5
+ paddsw m1, m6
+ mova m6, [coeffq+16*6]
+ mova m5, [coeffq+16*5]
+ punpckhwd m7, m0, m2 ;packed out1, out5
+ punpcklwd m0, m2 ;packed out0, out4
+ punpckhwd m2, m1, m3 ;packed out3, out7
+ punpcklwd m1, m3 ;packed out2, out6
+ mova [coeffq+16*6], m7
+ mova m7, [coeffq+16*7]
+ punpckhwd m3, m4, m6 ;packed out9, out13
+ punpcklwd m4, m6 ;packed out8, out12
+ punpckhwd m6, m5, m7 ;packed out11, out15
+ punpcklwd m5, m7 ;packed out10, out14
+ jmp m(idct_16x4_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+
+%macro SAVE_8ROWS 2 ;src, stride
+ mova [%1+%2*0], m0
+ mova [%1+%2*1], m1
+ mova [%1+%2*2], m2
+ mova [%1+%2*3], m3
+ mova [%1+%2*4], m4
+ mova [%1+%2*5], m5
+ mova [%1+%2*6], m6
+ mova [%1+%2*7], m7
+%endmacro
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16, 8, 16*16
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ psrlw m2, 3 ; pw_2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ mov r3d, 4
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(idct_8x8_internal_8bpc).pass1)]
+
+.pass1:
+ LOAD_8ROWS coeffq+16*1, 32, 1
+ mov [rsp+gprsize+16*11], tx2q
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)]
+ jmp r3
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32, 1
+ mov tx2q, [rsp+gprsize+16*11]
+ jmp r3
+
+.pass2:
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end)]
+
+.pass2_pre:
+ mova [coeffq+16*2 ], m1
+ mova [coeffq+16*6 ], m3
+ mova [coeffq+16*10], m5
+ mova [coeffq+16*14], m7
+ mova m1, m2
+ mova m2, m4
+ mova m3, m6
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*5 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*13]
+
+.pass2_main:
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [coeffq+16*2 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*10]
+ mova m3, [coeffq+16*14]
+ mova m4, [coeffq+16*3 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*15]
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)]
+ jmp m(idct_8x16_internal_8bpc).pass1
+
+.pass2:
+ lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
+
+.pass2_pre:
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+
+.pass2_main:
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+
+ mov r3, dstq
+ lea dstq, [dstq+strideq*8]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iadst_8x8_internal_8bpc).end
+
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)]
+ jmp m(idct_8x16_internal_8bpc).pass1
+
+.pass2:
+ lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)]
+ lea r3, [dstq+strideq*8]
+
+.pass2_pre:
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+
+.pass2_main:
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*1, 32, 1
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32, 1
+ mov tx2q, r3
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(.end1)]
+
+.end:
+ mova [rsp+gprsize+16*0], m7
+ mova [rsp+gprsize+16*1], m6
+ mova m7, [o(pw_1697x16)]
+ REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+ mova m6, [rsp+gprsize+16*1]
+ mova [rsp+gprsize+16*2], m5
+ IDTX16 6, 5, 7
+ mova m5, [rsp+gprsize+16*0]
+ IDTX16 5, 7, 7
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [rsp+gprsize+16*2]
+ mova [rsp+gprsize+16*0], m5
+ mova [rsp+gprsize+16*1], m6
+ mova [rsp+gprsize+16*2], m7
+ jmp m(idct_8x8_internal_8bpc).end3
+
+.end1:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8, 8, 16*16
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r2d, 4
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*0, 32, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*1, 32, 1
+ call .main
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+16*2], m6
+ mova [rsp+gprsize*2+32*5], m5
+
+ mova m6, [o(pd_2048)]
+ ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a
+ ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a
+ psubsw m2, m0, m4 ;t9
+ paddsw m0, m4 ;t8
+ psubsw m4, m7, m3 ;t14
+ paddsw m7, m3 ;t15
+ ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a
+ mova m3, [rsp+gprsize*2+16*1]
+ mova m5, [rsp+gprsize*2+32*5]
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+32*5], m4
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m7
+ ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a
+ ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a
+ psubsw m4, m2, m3 ;t10
+ paddsw m2, m3 ;t11
+ psubsw m3, m1, m5 ;t13
+ paddsw m1, m5 ;t12
+ ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a
+ mova m7, [rsp+gprsize*2+32*5]
+ psubsw m6, m0, m2 ;t11a
+ paddsw m0, m2 ;t8a
+ paddsw m2, m7, m3 ;t9
+ psubsw m7, m3 ;t10
+ mova m5, [rsp+gprsize*2+16*0]
+ psubsw m3, m5, m0 ;out8
+ paddsw m0, m5 ;out7
+ mova [rsp+gprsize*2+32*5], m0
+ mova m5, [rsp+gprsize*2+16*9]
+ psubsw m0, m5, m2 ;out9
+ paddsw m2, m5 ;out6
+ mova [rsp+gprsize*2+16*0], m0
+ mova [rsp+gprsize*2+16*9], m2
+ mova m0, [rsp+gprsize*2+16*1]
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*1], m3
+ psubsw m5, m0, m4 ;t13
+ paddsw m0, m4 ;t14
+ mova m3, [o(pd_2048)]
+ psubsw m4, m2, m1 ;t12a
+ paddsw m1, m2 ;t15a
+ mova [rsp+gprsize*2+16*2], m1
+ ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a
+ ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12
+ mova m3, [rsp+gprsize*2+16*8]
+ psubsw m2, m3, m5 ;out10
+ paddsw m3, m5 ;out5
+ mova m5, [rsp+gprsize*2+16*7]
+ mova [rsp+gprsize*2+16*8], m3
+ psubsw m3, m5, m4 ;out11
+ paddsw m5, m4 ;out4
+ mova m4, [rsp+gprsize*2+16*6]
+ mova [rsp+gprsize*2+16*7], m5
+ paddsw m5, m4, m6 ;out3
+ psubsw m4, m6 ;out12
+ mova m6, [rsp+gprsize*2+16*5]
+ mova [rsp+gprsize*2+16*6], m5
+ psubsw m5, m6, m7 ;out13
+ paddsw m6, m7 ;out2
+ mova m7, [rsp+gprsize*2+16*4]
+ mova [rsp+gprsize*2+16*5], m6
+ psubsw m6, m7, m0 ;out14
+ paddsw m7, m0 ;out1
+ mova m1, [rsp+gprsize*2+16*2]
+ mova m0, [rsp+gprsize*2+16*3]
+ mova [rsp+gprsize*2+16*4], m7
+ psubsw m7, m0, m1 ;out15
+ paddsw m0, m1 ;out0
+ mova [rsp+gprsize*2+16*3], m0
+ mova m1, [rsp+gprsize*2+16*0]
+ mova m0, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*0], m7
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ]
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13]
+
+ call .main
+ call .main_pass1_end
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(iadst_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iadst_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iadst_8x8_internal_8bpc).pass2_main
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*0], m1
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+16*2], m6
+
+ mova m6, [o(pd_2048)]
+ ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2
+ ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10
+ psubsw m1, m0, m4 ;t10a
+ paddsw m0, m4 ;t2a
+ psubsw m4, m7, m3 ;t11a
+ paddsw m3, m7 ;t3a
+ ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10
+ mova m2, [rsp+gprsize*2+16*0] ;in3
+ mova m7, [rsp+gprsize*2+16*1] ;in4
+ mova [rsp+gprsize*2+16*0], m1 ;t11
+ mova [rsp+gprsize*2+16*1], m4 ;t10
+ mova m1, [rsp+gprsize*2+16*2] ;in12
+ mova [rsp+gprsize*2+16*2], m0 ;t2a
+ ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4
+ ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12
+ psubsw m0, m7, m1 ;t12a
+ paddsw m1, m7 ;t4a
+ psubsw m4, m5, m2 ;t13a
+ paddsw m5, m2 ;t5a
+ ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13
+ mova m2, [rsp+gprsize*2+16*8] ;in1
+ mova m7, [rsp+gprsize*2+16*9] ;in14
+ mova [rsp+gprsize*2+16*8], m4 ;t12
+ mova [rsp+gprsize*2+16*9], m0 ;t13
+ mova m4, [rsp+gprsize*2+16*4] ;in9
+ mova m0, [rsp+gprsize*2+16*5] ;in6
+ mova [rsp+gprsize*2+16*4], m1 ;t4a
+ mova [rsp+gprsize*2+16*5], m5 ;t5a
+ ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14
+ ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6
+ psubsw m1, m0, m7 ;t14a
+ paddsw m0, m7 ;t6a
+ psubsw m5, m4, m2 ;t15a
+ paddsw m4, m2 ;t7a
+ ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*2] ;t2a
+ mova [rsp+gprsize*2+16*2], m5 ;t14
+ psubsw m7, m2, m0 ;t6
+ paddsw m2, m0 ;t2
+ psubsw m0, m3, m4 ;t7
+ paddsw m3, m4 ;t3
+ ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a
+ mova m4, [rsp+gprsize*2+16*7] ;in0
+ mova m5, [rsp+gprsize*2+32*5] ;in15
+ mova [rsp+gprsize*2+16*7], m3 ;t3
+ mova [rsp+gprsize*2+32*5], m1 ;t15
+ mova m1, [rsp+gprsize*2+16*6] ;in7
+ mova m3, [rsp+gprsize*2+16*3] ;in8
+ mova [rsp+gprsize*2+16*6], m7 ;t7a
+ mova [rsp+gprsize*2+16*3], m0 ;t6a
+ ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0
+ ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8
+ psubsw m0, m4, m3 ;t8a
+ paddsw m4, m3 ;t0a
+ psubsw m3, m5, m1 ;t9a
+ paddsw m5, m1 ;t1a
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8
+ mova m1, [rsp+gprsize*2+16*4] ;t4a
+ mova m7, [rsp+gprsize*2+16*5] ;t5a
+ mova [rsp+gprsize*2+16*4], m3 ;t8
+ mova [rsp+gprsize*2+16*5], m0 ;t9
+ psubsw m0, m4, m1 ;t4
+ paddsw m4, m1 ;t0
+ psubsw m3, m5, m7 ;t5
+ paddsw m5, m7 ;t1
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a
+ mova m7, [rsp+gprsize*2+16*3] ;t6a
+ psubsw m1, m4, m2 ;t2a
+ paddsw m4, m2 ;out0
+ mova [rsp+gprsize*2+16*3], m4 ;out0
+ mova m4, [rsp+gprsize*2+16*6] ;t7a
+ psubsw m2, m3, m7 ;t6
+ paddsw m3, m7 ;-out3
+ mova [rsp+gprsize*2+16*6], m3 ;-out3
+ psubsw m3, m0, m4 ;t7
+ paddsw m0, m4 ;out12
+ mova [rsp+gprsize*2+16*12], m3
+ mova m3, [rsp+gprsize*2+16*7] ;t3
+ mova [rsp+gprsize*2+16* 7], m2 ;out4
+ psubsw m2, m5, m3 ;t3a
+ paddsw m5, m3 ;-out15
+ mova [rsp+gprsize*2+16*11], m2
+ mova m2, [rsp+gprsize*2+32*5] ;t15
+ mova [rsp+gprsize*2+16*10], m1 ;-out7
+ mova m1, [rsp+gprsize*2+16*0] ;t11
+ mova [rsp+gprsize*2+16*0 ], m5 ;-out15
+ mova m3, [rsp+gprsize*2+16*1] ;t10
+ mova [rsp+gprsize*2+16*1 ], m4 ;-out11
+ mova m4, [rsp+gprsize*2+16*2] ;t14
+ mova [rsp+gprsize*2+16*2 ], m0 ;out12
+ psubsw m0, m3, m4 ;t14a
+ paddsw m3, m4 ;t10a
+ psubsw m5, m1, m2 ;t15a
+ paddsw m1, m2 ;t11a
+ ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*4] ;t8
+ mova m4, [rsp+gprsize*2+16*5] ;t9
+ mova [rsp+gprsize*2+16*4], m3 ;t10a
+ mova [rsp+gprsize*2+16*5], m1 ;t11a
+ mova m3, [rsp+gprsize*2+16*8] ;t12
+ mova m1, [rsp+gprsize*2+16*9] ;t13
+ mova [rsp+gprsize*2+16*8], m5 ;t14
+ mova [rsp+gprsize*2+16*9], m0 ;t15
+ psubsw m5, m2, m3 ;t12a
+ paddsw m2, m3 ;t8a
+ psubsw m0, m4, m1 ;t13a
+ paddsw m4, m1 ;t9a
+ ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12
+ mova m6, [rsp+gprsize*2+16*4] ;t10a
+ mova m1, [rsp+gprsize*2+16*5] ;t11a
+ psubsw m3, m2, m6 ;t10
+ paddsw m2, m6 ;-out1
+ paddsw m6, m4, m1 ;out14
+ psubsw m4, m1 ;t11
+ mova [rsp+gprsize*2+16*14], m4
+ mova [rsp+gprsize*2+16* 4], m2 ;-out1
+ mova m4, [rsp+gprsize*2+16*8] ;t14
+ mova m2, [rsp+gprsize*2+16*9] ;t15
+ mova [rsp+gprsize*2+16* 9], m3 ;out6
+ psubsw m3, m0, m4 ;t14a
+ paddsw m0, m4 ;out2
+ psubsw m4, m5, m2 ;t15a
+ paddsw m5, m2 ;-out13
+ mova [rsp+gprsize*2+16* 5], m0 ;out2
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova m0, [rsp+gprsize*2+16*14]
+ mova [rsp+gprsize*2+16*14], m5
+ mova [rsp+gprsize*2+16*15], m6
+ mova m5, [o(pw_2896_2896)]
+ mova m6, [o(pw_2896_m2896)]
+ mova m7, [o(pd_2048)]
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m4, m5, m2
+ pmaddwd m2, m6
+ pmaddwd m1, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m4, m2, m1, m3
+ REPX {psrad x, 12}, m4, m1, m2, m3
+ packssdw m4, m1 ;-out5
+ packssdw m2, m3 ;out10
+ mova [rsp+gprsize*2+16* 8], m4
+ mova m3, [rsp+gprsize*2+16* 9]
+ punpcklwd m1, m3, m0
+ punpckhwd m3, m0
+ pmaddwd m0, m5, m1
+ pmaddwd m1, m6
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m0, m1, m4, m3
+ REPX {psrad x, 12}, m0, m4, m1, m3
+ packssdw m0, m4 ;out6
+ packssdw m1, m3 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
+ mova m0, [rsp+gprsize*2+16* 7]
+ mova m4, [rsp+gprsize*2+16*12]
+ punpcklwd m3, m0, m4
+ punpckhwd m0, m4
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ pmaddwd m5, m0
+ pmaddwd m0, m6
+ REPX {paddd x, m7}, m4, m3, m5, m0
+ REPX {psrad x, 12}, m4, m5, m3, m0
+ packssdw m4, m5 ;out4
+ packssdw m3, m0 ;-out11
+ mova [rsp+gprsize*2+16* 7], m4
+ mova m4, [rsp+gprsize*2+16*10]
+ mova m5, [rsp+gprsize*2+16*11]
+ punpcklwd m0, m4, m5
+ punpckhwd m4, m5
+ pmaddwd m5, m0, [o(pw_2896_2896)]
+ pmaddwd m0, m6
+ pmaddwd m6, m4
+ pmaddwd m4, [o(pw_2896_2896)]
+ REPX {paddd x, m7}, m5, m0, m6, m4
+ REPX {psrad x, 12}, m0, m6, m5, m4
+ packssdw m0, m6 ;out8
+ packssdw m5, m4 ;-out7
+ mova [rsp+gprsize*2+16*10], m5
+ mova m4, [rsp+gprsize*2+16* 2] ;out12
+ mova m5, [rsp+gprsize*2+16*14] ;-out13
+ mova m6, [rsp+gprsize*2+16*15] ;out14
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ mova m7, [o(pw_2896x8)]
+ mova m1, [rsp+gprsize*2+16* 9]
+ mova m2, [rsp+gprsize*2+16*14]
+ paddsw m0, m1, m2
+ psubsw m1, m2
+ pmulhrsw m0, m7 ;out6
+ pmulhrsw m1, m7 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
+ psubsw m2, m3, m4
+ paddsw m3, m4
+ pmulhrsw m2, m7 ;out10
+ pmulhrsw m3, m7 ;-out5
+ mova [rsp+gprsize*2+16* 8], m3
+ mova m3, [rsp+gprsize*2+16* 7]
+ mova m4, [rsp+gprsize*2+16*12]
+ paddsw m0, m3, m4
+ psubsw m3, m4
+ pmulhrsw m0, m7 ;out4
+ pmulhrsw m3, m7 ;-out11
+ mova [rsp+gprsize*2+16* 7], m0
+ mova m0, [rsp+gprsize*2+16*10]
+ paddsw m4, m0, [rsp+gprsize*2+16*11]
+ psubsw m0, [rsp+gprsize*2+16*11]
+ pmulhrsw m4, m7 ;-out7
+ pmulhrsw m0, m7 ;out8
+ mova [rsp+gprsize*2+16*10], m4
+ mova m4, [rsp+gprsize*2+16*2 ] ;out12
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ]
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iflipadst_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal_8bpc).pass2_main
+
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ add coeffq, 16*16
+ mova m4, [coeffq-16*7]
+ mova m5, [coeffq-16*5]
+ mova m6, [coeffq-16*3]
+ mova m7, [coeffq-16*1]
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+
+.pass1:
+ mova m0, [o(pw_2896x8)]
+ mova m2, [o(pw_1697x16)]
+ mova m3, [o(pw_16384)]
+ sub coeffq, 8*16
+ REPX {pmulhrsw x, m0}, m4, m5, m6, m7
+ pmulhrsw m1, m2, m4
+ pmulhrsw m1, m3
+ paddsw m1, m4 ; 1
+ pmulhrsw m4, m2, m5
+ pmulhrsw m4, m3
+ paddsw m4, m5 ; 3
+ pmulhrsw m5, m2, m6
+ pmulhrsw m5, m3
+ paddsw m5, m6 ; 5
+ pmulhrsw m6, m2, m7
+ pmulhrsw m6, m3
+ paddsw m7, m6 ; 7
+ pmulhrsw m6, m0, [coeffq+16*6]
+ mova [rsp+gprsize+16*0], m4
+ pmulhrsw m4, m2, m6
+ pmulhrsw m4, m3
+ paddsw m6, m4 ; 6
+ pmulhrsw m4, m0, [coeffq+16*4]
+ mova [rsp+gprsize+16*1], m6
+ pmulhrsw m6, m2, m4
+ pmulhrsw m6, m3
+ paddsw m4, m6 ; 4
+ pmulhrsw m6, m0, [coeffq+16*2]
+ pmulhrsw m0, [coeffq+16*0]
+ pmulhrsw m2, m6
+ pmulhrsw m2, m3
+ paddsw m2, m6 ; 2
+ pmulhrsw m6, m0, [o(pw_1697x16)]
+ pmulhrsw m6, m3
+ mova m3, [rsp+gprsize+16*0]
+ paddsw m0, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ mova [coeffq+16*1], m4
+ mova [coeffq+16*3], m5
+ mova [coeffq+16*5], m6
+ mova [coeffq+16*7], m7
+ mova m4, [coeffq-16*7]
+ mova m5, [coeffq-16*5]
+ mova m6, [coeffq-16*3]
+ mova m7, [coeffq-16*1]
+ mova [coeffq-16*7], m0
+ mova [coeffq-16*5], m1
+ mova [coeffq-16*3], m2
+ mova [coeffq-16*1], m3
+ mov tx2q, r3
+ jmp .pass1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iidentity_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iidentity_8x8_internal_8bpc).end
+
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16, 8, 16*16
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r2d, 8
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*1, 64
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*3, 64
+ call m(idct_16x8_internal_8bpc).main
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*17, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 64
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*2, 64
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ jmp m(idct_8x16_internal_8bpc).pass2_pre
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3
+ lea r3, [dstq+8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+ mov dstq, r3
+
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*4 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*12]
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*5 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*13]
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end)]
+ jmp m(idct_8x16_internal_8bpc).pass2_main
+
+
+%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
+ mova m0, [coeffq+16*1 ]
+ mova m1, [coeffq+16*3 ]
+ mova m2, [coeffq+16*29]
+ mova m3, [coeffq+16*31]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ mova m0, [coeffq+16*13]
+ mova m1, [coeffq+16*15]
+ mova m2, [coeffq+16*17]
+ mova m3, [coeffq+16*19]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ mova m0, [coeffq+16*5 ]
+ mova m1, [coeffq+16*7 ]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*11]
+ mova m4, [coeffq+16*21]
+ mova m5, [coeffq+16*23]
+ mova m6, [coeffq+16*25]
+ mova m7, [coeffq+16*27]
+%endmacro
+
+%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*2 ]
+ mova m2, [coeffq+16*28]
+ mova m3, [coeffq+16*30]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ mova m0, [coeffq+16*12]
+ mova m1, [coeffq+16*14]
+ mova m2, [coeffq+16*16]
+ mova m3, [coeffq+16*18]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m4, [coeffq+16*20]
+ mova m5, [coeffq+16*22]
+ mova m6, [coeffq+16*24]
+ mova m7, [coeffq+16*26]
+%endmacro
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_16X16_ADST_LOAD_ODD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*17, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*1, 32
+ ITX_16X16_ADST_LOAD_EVEN_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ jmp m(iadst_8x16_internal_8bpc).pass2_pre
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3
+ lea r3, [dstq+8]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+ mov dstq, r3
+
+ mova m4, [coeffq+16*0 ]
+ mova m5, [coeffq+16*2 ]
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m6, [coeffq+16*12]
+ mova m7, [coeffq+16*14]
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
+ jmp m(iadst_8x16_internal_8bpc).pass2_main
+
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_16X16_ADST_LOAD_ODD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*17, 32
+ ITX_16X16_ADST_LOAD_EVEN_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS coeffq+16* 0, 32
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+
+ mova m4, [coeffq+16*0 ]
+ mova m5, [coeffq+16*2 ]
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m6, [coeffq+16*12]
+ mova m7, [coeffq+16*14]
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+
+ lea tx2q, [o(.end2)]
+ mov dstq, r3
+ jmp m(iflipadst_8x16_internal_8bpc).pass2_main
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+ pmulhrsw m%2, m%3, m%1
+ psraw m%2, 1
+ pavgw m%1, m%2
+%endmacro
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ add coeffq, 16*17
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+
+.pass1:
+ mova m6, [o(pw_1697x16)]
+ mova m7, [coeffq+32*6]
+ mova m0, [coeffq+32*0]
+ mova m1, [coeffq+32*1]
+ mova m2, [coeffq+32*2]
+ mova m3, [coeffq+32*3]
+ mova m4, [coeffq+32*4]
+ REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
+ mova m5, [coeffq+32*5]
+ mova [rsp+gprsize+16*1], m7
+ IDTX16B 5, 7, 6
+ mova m7, [coeffq+32*7]
+ IDTX16B 7, 6, 6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 16
+ lea tx2q, [o(.pass1_end1)]
+ jmp .pass1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 15*16
+ lea tx2q, [o(.pass1_end2)]
+ jmp .pass1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 16
+ mov tx2q, r3
+ jmp .pass1
+
+.pass2:
+ lea r3, [dstq+8]
+ lea tx2q, [o(.end1)]
+
+.end:
+ mova [rsp+gprsize+16*0], m7
+ mova [rsp+gprsize+16*1], m4
+ mova m7, [o(pw_1697x16)]
+ REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
+ mova m4, [o(pw_2048)]
+ pmulhrsw m5, m4
+ pmulhrsw m6, m4
+ mova [rsp+gprsize+16*2], m5
+ mova m5, [rsp+gprsize+16*1]
+ mova [rsp+gprsize+16*1], m6
+ IDTX16 5, 6, 7
+ mova m6, [rsp+gprsize+16*0]
+ IDTX16 6, 7, 7
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6
+ pmulhrsw m4, m5
+ mova [rsp+gprsize+16*0], m6
+ jmp m(idct_8x8_internal_8bpc).end3
+
+.end1:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(.end2)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+.end2:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+ LOAD_8ROWS coeffq, 32
+ lea tx2q, [o(.end3)]
+ mov dstq, r3
+ jmp .end
+
+.end3:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_8x32_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m2
+ psrlw m2, 2 ;pw_2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ mov r3d, 8
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
+
+.end:
+ RET
+
+
+
+cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ cmp eobd, 106
+ jle .fast
+
+ LOAD_8ROWS coeffq+16*3, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1:
+ mova [rsp+gprsize+16*9 ], m0 ;in24
+ mova [rsp+gprsize+16*10], m4 ;in28
+ mova [rsp+gprsize+16*17], m2 ;in26
+ mova [rsp+gprsize+16*18], m6 ;in30
+ mova [rsp+gprsize+16*31], m1 ;in25
+ mova [rsp+gprsize+16*30], m3 ;in27
+ mova [rsp+gprsize+16*27], m5 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ LOAD_8ROWS coeffq+16*2, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_1:
+ mova [rsp+gprsize+16*7 ], m0 ;in16
+ mova [rsp+gprsize+16*8 ], m4 ;in20
+ mova [rsp+gprsize+16*15], m2 ;in18
+ mova [rsp+gprsize+16*16], m6 ;in22
+ mova [rsp+gprsize+16*33], m1 ;in17
+ mova [rsp+gprsize+16*28], m3 ;in19
+ mova [rsp+gprsize+16*29], m5 ;in21
+ mova [rsp+gprsize+16*32], m7 ;in23
+
+.fast:
+ LOAD_8ROWS coeffq+16*1, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ mova [rsp+gprsize+16*5 ], m0 ;in8
+ mova [rsp+gprsize+16*6 ], m4 ;in12
+ mova [rsp+gprsize+16*13], m2 ;in10
+ mova [rsp+gprsize+16*14], m6 ;in14
+ mova [rsp+gprsize+16*21], m1 ;in9
+ mova [rsp+gprsize+16*24], m3 ;in11
+ mova [rsp+gprsize+16*25], m5 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+ LOAD_8ROWS coeffq+16*0, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ mova [rsp+gprsize+16*11], m2 ;in2
+ mova [rsp+gprsize+16*12], m6 ;in6
+ mova [rsp+gprsize+16*19], m1 ;in1
+ mova [rsp+gprsize+16*26], m3 ;in3
+ mova [rsp+gprsize+16*23], m5 ;in5
+ mova [rsp+gprsize+16*22], m7 ;in7
+ mova m1, m4 ;in4
+ mova m2, [rsp+gprsize+16*5 ] ;in8
+ mova m3, [rsp+gprsize+16*6 ] ;in12
+
+ cmp eobd, 106
+ jg .full
+
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ mova m0, [rsp+gprsize+16*11]
+ mova m1, [rsp+gprsize+16*12]
+ mova m2, [rsp+gprsize+16*13]
+ mova m3, [rsp+gprsize+16*14]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call .main_fast
+ jmp .pass2
+
+.full:
+ mova m4, [rsp+gprsize+16*7 ] ;in16
+ mova m5, [rsp+gprsize+16*8 ] ;in20
+ mova m6, [rsp+gprsize+16*9 ] ;in24
+ mova m7, [rsp+gprsize+16*10] ;in28
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+ call .main
+
+.pass2:
+ lea r3, [o(.end6)]
+
+.end:
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.end2)]
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+
+ jmp tx2q
+
+.end2:
+ lea tx2q, [o(.end3)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end3:
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ lea tx2q, [o(.end4)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end4:
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ lea tx2q, [o(.end5)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end5:
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end6:
+ ret
+
+ALIGN function_align
+cglobal_label .main_veryfast
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31
+ pmulhrsw m0, [o(pw_201x8)] ;t16,t17
+ mova m7, [o(pd_2048)]
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*20], m3 ;t17a
+ mova [rsp+gprsize*2+16*33], m0 ;t30a
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29
+ pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19
+ mova [rsp+gprsize*2+16*22], m1 ;t19
+ mova [rsp+gprsize*2+16*31], m2 ;t28
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m2 ;t18a
+ mova [rsp+gprsize*2+16*32], m1 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27
+ pmulhrsw m0, [o(pw_995x8)] ;t20, t21
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*24], m3 ;t21a
+ mova [rsp+gprsize*2+16*29], m0 ;t26a
+ mova m2, [rsp+gprsize*2+16*26] ;in3
+ pxor m0, m0
+ mova m3, m0
+ pmulhrsw m1, m2, [o(pw_4052x8)]
+ pmulhrsw m2, [o(pw_m601x8)]
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main_fast ;bottom half is zero
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ mova m1, [rsp+gprsize*2+16*20] ;in15
+ pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a
+ pmulhrsw m0, [o(pw_201x8)] ;t16a
+ pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a
+ pmulhrsw m1, [o(pw_m2751x8)] ;t17a
+ mova m7, [o(pd_2048)]
+ psubsw m4, m0, m1 ;t17
+ paddsw m0, m1 ;t16
+ psubsw m5, m3, m2 ;t30
+ paddsw m3, m2 ;t31
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*20], m5 ;t17a
+ mova [rsp+gprsize*2+16*33], m4 ;t30a
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ mova m0, [rsp+gprsize*2+16*21] ;in9
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ pmulhrsw m3, m0, [o(pw_3703x8)]
+ pmulhrsw m0, [o(pw_1751x8)]
+ pmulhrsw m2, m1, [o(pw_3857x8)]
+ pmulhrsw m1, [o(pw_m1380x8)]
+ psubsw m4, m1, m0 ;t18
+ paddsw m0, m1 ;t19
+ psubsw m5, m2, m3 ;t29
+ paddsw m3, m2 ;t28
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*22], m0 ;t19
+ mova [rsp+gprsize*2+16*31], m3 ;t28
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ mova m1, [rsp+gprsize*2+16*24] ;in11
+ pmulhrsw m3, m0, [o(pw_3973x8)]
+ pmulhrsw m0, [o(pw_995x8)]
+ pmulhrsw m2, m1, [o(pw_3513x8)]
+ pmulhrsw m1, [o(pw_m2106x8)]
+ psubsw m4, m0, m1 ;t21
+ paddsw m0, m1 ;t20
+ psubsw m5, m3, m2 ;t26
+ paddsw m3, m2 ;t27
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m4 ;t26a
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ mova m0, [rsp+gprsize*2+16*25] ;in13
+ mova m2, [rsp+gprsize*2+16*26] ;in3
+ pmulhrsw m3, m0, [o(pw_3290x8)]
+ pmulhrsw m0, [o(pw_2440x8)]
+ pmulhrsw m1, m2, [o(pw_4052x8)]
+ pmulhrsw m2, [o(pw_m601x8)]
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main
+ mova m7, [o(pd_2048)]
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ mova m1, [rsp+gprsize*2+16*20] ;in15
+ mova m2, [rsp+gprsize*2+16*33] ;in17
+ mova m3, [rsp+gprsize*2+16*34] ;in31
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a
+ psubsw m4, m0, m2 ;t17
+ paddsw m0, m2 ;t16
+ psubsw m5, m3, m1 ;t30
+ paddsw m3, m1 ;t31
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*20], m5 ;t17a
+ mova [rsp+gprsize*2+16*33], m4 ;t30a
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ mova m0, [rsp+gprsize*2+16*21] ;in9
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ mova m2, [rsp+gprsize*2+16*31] ;in25
+ mova m3, [rsp+gprsize*2+16*32] ;in23
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a
+ psubsw m4, m2, m0 ;t18
+ paddsw m0, m2 ;t19
+ psubsw m5, m1, m3 ;t29
+ paddsw m3, m1 ;t28
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*22], m0 ;t19
+ mova [rsp+gprsize*2+16*31], m3 ;t28
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ mova m1, [rsp+gprsize*2+16*24] ;in11
+ mova m2, [rsp+gprsize*2+16*29] ;in21
+ mova m3, [rsp+gprsize*2+16*30] ;in27
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a
+ psubsw m4, m0, m2 ;t21
+ paddsw m0, m2 ;t20
+ psubsw m5, m3, m1 ;t26
+ paddsw m3, m1 ;t27
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m4 ;t26a
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ mova m0, [rsp+gprsize*2+16*25] ;in13
+ mova m1, [rsp+gprsize*2+16*26] ;in3
+ mova m2, [rsp+gprsize*2+16*27] ;in29
+ mova m3, [rsp+gprsize*2+16*28] ;in19
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a
+
+.main2:
+ psubsw m4, m2, m0 ;t22
+ paddsw m0, m2 ;t23
+ psubsw m5, m1, m3 ;t25
+ paddsw m3, m1 ;t24
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a
+ mova m2, [rsp+gprsize*2+16*24] ;t21a
+ psubsw m1, m5, m2 ;t21
+ paddsw m5, m2 ;t22
+ mova [rsp+gprsize*2+16*25], m5 ;t22
+ mova m2, [rsp+gprsize*2+16*29] ;t26a
+ psubsw m5, m4, m2 ;t26
+ paddsw m4, m2 ;t25
+ mova [rsp+gprsize*2+16*28], m4 ;t25
+ ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m1 ;t26a
+
+ mova m1, [rsp+gprsize*2+16*23] ;t20
+ mova m5, [rsp+gprsize*2+16*30] ;t27
+ psubsw m2, m0, m1 ;t20a
+ paddsw m0, m1 ;t23a
+ psubsw m6, m3, m5 ;t27a
+ paddsw m3, m5 ;t24a
+ ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27
+ mova [rsp+gprsize*2+16*26], m0 ;t23a
+ mova [rsp+gprsize*2+16*27], m3 ;t24a
+ mova [rsp+gprsize*2+16*30], m2 ;t27
+
+ mova m0, [rsp+gprsize*2+16*20] ;t17a
+ mova m1, [rsp+gprsize*2+16*21] ;t18a
+ mova m2, [rsp+gprsize*2+16*32] ;t29a
+ mova m3, [rsp+gprsize*2+16*33] ;t30a
+ psubsw m4, m0, m1 ;t18
+ paddsw m0, m1 ;t17
+ psubsw m5, m3, m2 ;t29
+ paddsw m3, m2 ;t30
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a
+ mova [rsp+gprsize*2+16*20], m0 ;t17
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova [rsp+gprsize*2+16*33], m3 ;t30
+ mova m0, [rsp+gprsize*2+16*19] ;t16
+ mova m1, [rsp+gprsize*2+16*22] ;t19
+ mova m2, [rsp+gprsize*2+16*31] ;t28
+ mova m3, [rsp+gprsize*2+16*34] ;t31
+ psubsw m4, m0, m1 ;t19a
+ paddsw m0, m1 ;t16a
+ psubsw m5, m3, m2 ;t28a
+ paddsw m3, m2 ;t31a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28
+ mova m2, [rsp+gprsize*2+16*15] ;tmp12
+ psubsw m1, m5, m6 ;t20a
+ paddsw m5, m6 ;t19a
+ psubsw m6, m2, m5 ;out19
+ paddsw m2, m5 ;out12
+ mova m5, [rsp+gprsize*2+16*30] ;t27
+ mova [rsp+gprsize*2+16*22], m6 ;out19
+ mova [rsp+gprsize*2+16*15], m2 ;out12
+ psubsw m6, m4, m5 ;t27a
+ paddsw m4, m5 ;t28a
+ ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27
+ mova m2, [rsp+gprsize*2+16*6 ] ;tmp3
+ psubsw m5, m2, m4 ;out28
+ paddsw m2, m4 ;out3
+ mova m4, [rsp+gprsize*2+16*14] ;tmp11
+ mova [rsp+gprsize*2+16*31], m5 ;out28
+ mova [rsp+gprsize*2+16*6 ], m2 ;out3
+ psubsw m5, m4, m6 ;out20
+ paddsw m4, m6 ;out11
+ mova m2, [rsp+gprsize*2+16*7 ] ;tmp4
+ mova [rsp+gprsize*2+16*23], m5 ;out20
+ mova [rsp+gprsize*2+16*14], m4 ;out11
+ psubsw m5, m2, m1 ;out27
+ paddsw m2, m1 ;out4
+ mova m1, [rsp+gprsize*2+16*26] ;t23a
+ mova m4, [rsp+gprsize*2+16*27] ;t24a
+ mova [rsp+gprsize*2+16*30], m5 ;out27
+ mova [rsp+gprsize*2+16*7 ], m2 ;out4
+ psubsw m5, m0, m1 ;t23
+ paddsw m0, m1 ;t16
+ psubsw m2, m3, m4 ;t24
+ paddsw m3, m4 ;t31
+ ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a
+ mova m6, [rsp+gprsize*2+16*18] ;tmp15
+ psubsw m4, m6, m0 ;out16
+ paddsw m6, m0 ;out15
+ mova m0, [rsp+gprsize*2+16*3 ] ;tmp0
+ mova m1, [rsp+gprsize*2+16*11] ;tmp8
+ mova [rsp+gprsize*2+16*18], m6 ;out15
+ mova [rsp+gprsize*2+16*19], m4 ;out16
+ psubsw m6, m0, m3 ;out31
+ paddsw m0, m3 ;out0
+ psubsw m4, m1, m2 ;out23
+ paddsw m1, m2 ;out8
+ mova m3, [rsp+gprsize*2+16*10] ;tmp7
+ mova [rsp+gprsize*2+16*34], m6 ;out31
+ mova [rsp+gprsize*2+16*11], m1 ;out8
+ mova [rsp+gprsize*2+16*26], m4 ;out23
+ paddsw m6, m3, m5 ;out7
+ psubsw m3, m5 ;out24
+ mova m1, [rsp+gprsize*2+16*20] ;t17
+ mova m5, [rsp+gprsize*2+16*25] ;t22
+ mova m2, [rsp+gprsize*2+16*17] ;tmp14
+ mova [rsp+gprsize*2+16*27], m3 ;out24
+ psubsw m4, m1, m5 ;t22a
+ paddsw m1, m5 ;t17a
+ psubsw m3, m2, m1 ;out17
+ paddsw m2, m1 ;out14
+ mova m5, [rsp+gprsize*2+16*28] ;t25
+ mova m1, [rsp+gprsize*2+16*33] ;t30
+ mova [rsp+gprsize*2+16*17], m2 ;out14
+ mova [rsp+gprsize*2+16*20], m3 ;out17
+ psubsw m2, m1, m5 ;t25a
+ paddsw m1, m5 ;t30a
+ ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25
+ mova m5, [rsp+gprsize*2+16*4 ] ;tmp1
+ psubsw m3, m5, m1 ;out30
+ paddsw m5, m1 ;out1
+ mova m1, [rsp+gprsize*2+16*12] ;tmp9
+ mova [rsp+gprsize*2+16*33], m3 ;out30
+ mova [rsp+gprsize*2+16*4 ], m5 ;out1
+ psubsw m3, m1, m2 ;out22
+ paddsw m1, m2 ;out9
+ mova m5, [rsp+gprsize*2+16*9 ] ;tmp6
+ mova [rsp+gprsize*2+16*25], m3 ;out22
+ mova [rsp+gprsize*2+16*12], m1 ;out9
+ psubsw m3, m5, m4 ;out25
+ paddsw m5, m4 ;out6
+ mova m4, [rsp+gprsize*2+16*21] ;t18a
+ mova m1, [rsp+gprsize*2+16*24] ;t21a
+ mova m2, [rsp+gprsize*2+16*16] ;tmp13
+ mova [rsp+gprsize*2+16*28], m3 ;out25
+ mova [rsp+gprsize*2+16*9 ], m5 ;out6
+ paddsw m3, m4, m1 ;t18
+ psubsw m4, m1 ;t21
+ psubsw m5, m2, m3 ;out18
+ paddsw m2, m3 ;out13
+ mova m1, [rsp+gprsize*2+16*29] ;t26a
+ mova m3, [rsp+gprsize*2+16*32] ;t29a
+ mova [rsp+gprsize*2+16*21], m5 ;out18
+ mova [rsp+gprsize*2+16*16], m2 ;out13
+ psubsw m5, m3, m1 ;t26
+ paddsw m3, m1 ;t29
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a
+ mova m2, [rsp+gprsize*2+16*5 ] ;tmp2
+ psubsw m1, m2, m3 ;out29
+ paddsw m2, m3 ;out2
+ mova m3, [rsp+gprsize*2+16*13] ;tmp10
+ mova [rsp+gprsize*2+16*32], m1 ;out29
+ psubsw m7, m3, m5 ;out21
+ paddsw m3, m5 ;out10
+ mova m5, [rsp+gprsize*2+16*8 ] ;tmp5
+ mova [rsp+gprsize*2+16*24], m7 ;out21
+ mova [rsp+gprsize*2+16*13], m3 ;out10
+ psubsw m1, m5, m4 ;out26
+ paddsw m5, m4 ;out5
+ mova m7, m6 ;out7
+ mova m3, [rsp+gprsize*2+16*6 ] ;out3
+ mova m4, [rsp+gprsize*2+16*7 ] ;out4
+ mova [rsp+gprsize*2+16*29], m1 ;out26
+ mova m6, [rsp+gprsize*2+16*9 ] ;out6
+ mova m1, [rsp+gprsize*2+16*4 ] ;out1
+ ret
+
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_32x8_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 8
+ lea tx2q, [o(.end)]
+
+.body:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ pxor m5, m5
+
+.loop:
+ mova m1, [dstq+16*0]
+ mova m3, [dstq+16*1]
+ punpckhbw m2, m1, m5
+ punpcklbw m1, m5
+ punpckhbw m4, m3, m5
+ punpcklbw m3, m5
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m3
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ jmp tx2q
+
+.end:
+ RET
+
+
+cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*0, 64
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*2, 64
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+16*1, 32
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ cmp eobd, 106
+ jg .full
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp .pass2
+
+.full:
+ LOAD_8ROWS coeffq+16*17, 32
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ call m(idct_8x32_internal_8bpc).main
+
+.pass2:
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.end)]
+ jmp m(idct_8x32_internal_8bpc).end1
+
+.end:
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end1:
+ lea r3, [dstq+8]
+ lea tx2q, [o(.end2)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end3:
+ mov dstq, r3
+ add r3, 8
+ lea tx2q, [o(.end4)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end4:
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end5:
+ mov dstq, r3
+ add r3, 8
+ lea tx2q, [o(.end6)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end6:
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end7:
+ mov dstq, r3
+ lea tx2q, [o(.end8)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end8:
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r5d, 4
+ mov tx2d, 2
+ cmp eobd, 107
+ cmovns tx2d, r5d
+ mov r3d, tx2d
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+.loop:
+ LOAD_8ROWS coeffq+16*0, 64
+ paddsw m6, [o(pw_5)]
+ mova [rsp+16*1], m6
+ mova m6, [o(pw_5)]
+ REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m6
+ mova [rsp+16*0], m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ pxor m7, m7
+ REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ add coeffq, 16
+ dec r3d
+ jg .loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r5d, 4
+ mov tx2d, 2
+ cmp eobd, 107
+ cmovns tx2d, r5d
+ mov r3d, tx2d
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+.loop:
+ LOAD_8ROWS coeffq+16*0, 16
+ pmulhrsw m6, [o(pw_4096)]
+ mova [rsp+16*1], m6
+ mova m6, [o(pw_4096)]
+ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+
+ mov [rsp+16*3], dstq
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m6
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+ call m(idct_8x8_internal_8bpc).end3
+
+ add coeffq, 16*8
+ mov dstq, [rsp+16*3]
+ lea dstq, [dstq+8]
+ dec r3d
+ jg .loop
+ RET
+
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_16x32_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r2d, 16
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+
+
+cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*1, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*5, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*33, 64 ;in8~in15
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ mova [coeffq+16*1 ], m0 ;in8
+ mova [coeffq+16*5 ], m4 ;in12
+ mova [rsp+gprsize+16*13], m2 ;in10
+ mova [rsp+gprsize+16*14], m6 ;in14
+ mova [rsp+gprsize+16*21], m1 ;in9
+ mova [rsp+gprsize+16*24], m3 ;in11
+ mova [rsp+gprsize+16*25], m5 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+ LOAD_8ROWS coeffq+16*0, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*4, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*32, 64 ;in0~in7
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ mova [rsp+gprsize+16*11], m2 ;in2
+ mova [rsp+gprsize+16*12], m6 ;in6
+ mova [rsp+gprsize+16*19], m1 ;in1
+ mova [rsp+gprsize+16*26], m3 ;in3
+ mova [rsp+gprsize+16*23], m5 ;in5
+ mova [rsp+gprsize+16*22], m7 ;in7
+
+ cmp eobd, 150
+ jg .full
+
+ mova m1, m4 ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*5 ] ;in12
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [rsp+gprsize+16*11] ;in2
+ mova m1, [rsp+gprsize+16*12] ;in6
+ mova m2, [rsp+gprsize+16*13] ;in10
+ mova m3, [rsp+gprsize+16*14] ;in14
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp .pass2
+
+.full:
+ mova [coeffq+16*0 ], m0 ;in0
+ mova [coeffq+16*4 ], m4 ;in4
+
+ LOAD_8ROWS coeffq+16*2, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*6, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+16*34, 64 ;in16~in23
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end5:
+ mova [coeffq+16*2 ], m0 ;in16
+ mova [coeffq+16*6 ], m4 ;in20
+ mova [rsp+gprsize+16*15], m2 ;in18
+ mova [rsp+gprsize+16*16], m6 ;in22
+ mova [rsp+gprsize+16*33], m1 ;in17
+ mova [rsp+gprsize+16*28], m3 ;in19
+ mova [rsp+gprsize+16*29], m5 ;in21
+ mova [rsp+gprsize+16*32], m7 ;in23
+
+ LOAD_8ROWS coeffq+16*3, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*7, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end6:
+ SAVE_8ROWS coeffq+16*35, 64 ;in24~in31
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end7:
+ mova [rsp+gprsize+16*17], m2 ;in26
+ mova [rsp+gprsize+16*18], m6 ;in30
+ mova [rsp+gprsize+16*31], m1 ;in25
+ mova [rsp+gprsize+16*30], m3 ;in27
+ mova [rsp+gprsize+16*27], m5 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ mova m6, m0 ;in24
+ mova m7, m4 ;in28
+ mova m0, [coeffq+16*0 ] ;in0
+ mova m1, [coeffq+16*4 ] ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*5 ] ;in12
+ mova m4, [coeffq+16*2 ] ;in16
+ mova m5, [coeffq+16*6 ] ;in20
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main
+
+.pass2:
+ mov [rsp+gprsize*1+16*35], eobd
+ lea r3, [dstq+8]
+ mov [rsp+gprsize*2+16*35], r3
+ lea r3, [o(.end)]
+ jmp m(idct_8x32_internal_8bpc).end
+
+.end:
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov eobd, [rsp+gprsize*1+16*35]
+ add coeffq, 16*32
+
+ mova m0, [coeffq+16*4 ] ;in1
+ mova m1, [coeffq+16*12] ;in3
+ mova m2, [coeffq+16*20] ;in5
+ mova m3, [coeffq+16*28] ;in7
+ mova m4, [coeffq+16*5 ] ;in9
+ mova m5, [coeffq+16*13] ;in11
+ mova m6, [coeffq+16*21] ;in13
+ mova m7, [coeffq+16*29] ;in15
+
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mova m0, [coeffq+16*0 ] ;in0
+ mova m1, [coeffq+16*16] ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*17] ;in12
+
+ cmp eobd, 150
+ jg .full1
+
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ] ;in2
+ mova m1, [coeffq+16*24] ;in6
+ mova m2, [coeffq+16*9 ] ;in10
+ mova m3, [coeffq+16*25] ;in14
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp m(idct_8x32_internal_8bpc).pass2
+
+.full1:
+ mova m4, [coeffq+16*2 ] ;in16
+ mova m5, [coeffq+16*18] ;in20
+ mova m6, [coeffq+16*3 ] ;in24
+ mova m7, [coeffq+16*19] ;in26
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ] ;in2
+ mova m1, [coeffq+16*24] ;in6
+ mova m2, [coeffq+16*9 ] ;in10
+ mova m3, [coeffq+16*25] ;in14
+ mova m4, [coeffq+16*10] ;in18
+ mova m5, [coeffq+16*26] ;in22
+ mova m6, [coeffq+16*11] ;in26
+ mova m7, [coeffq+16*27] ;in30
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*6 ] ;in17
+ mova m1, [coeffq+16*14] ;in19
+ mova m2, [coeffq+16*22] ;in21
+ mova m3, [coeffq+16*30] ;in23
+ mova m4, [coeffq+16*7 ] ;in25
+ mova m5, [coeffq+16*15] ;in27
+ mova m6, [coeffq+16*23] ;in29
+ mova m7, [coeffq+16*31] ;in31
+
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp m(idct_8x32_internal_8bpc).pass2
+
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_32x16_internal_8bpc)
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*11, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*19, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*27, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r3d, 16
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ add coeffq, 16
+ lea r3, [o(.pass1_end1)]
+.pass1:
+ LOAD_8ROWS coeffq+16*0, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*4, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+16*2, 64, 1
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ LOAD_8ROWS coeffq+16*34, 64, 1
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ call m(idct_8x32_internal_8bpc).main
+
+.pass1_end:
+ mova [rsp+gprsize+16*0 ], m7
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+16*32, 32
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+16*48, 32
+
+ sub coeffq, 16
+ lea r3, [o(.end)]
+ jmp .pass1
+
+.end:
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r4d, eobd
+ cmp eobd, 43 ;if (eob > 43)
+ sbb r3d, r3d ; iteration_count++
+ cmp r4d, 150 ;if (eob > 150)
+ sbb r3d, 0 ; iteration_count++
+ cmp r4d, 278 ;if (eob > 278)
+ sbb r3d, -4 ; iteration_count++
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+ mov [rsp+gprsize+16*3], r3d
+ mov [rsp+gprsize*2+16*3], coeffq
+
+.loop:
+ LOAD_8ROWS coeffq, 64, 1
+ mova [rsp+16*1], m6
+ pxor m6, m6
+ REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ mova [rsp+16*0], m2
+ mova [rsp+16*1], m3
+ mova [rsp+16*2], m4
+ mova m3, [o(pw_1697x16)]
+ mova m4, [o(pw_16384)]
+ REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
+ mova m2, [o(pw_8192)]
+ REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1
+ mova m2, [rsp+16*0]
+ mova [rsp+16*0], m7
+ IDTX16 2, 7, 3, 4
+ mova m7, [rsp+16*2]
+ mova [rsp+16*2], m5
+ IDTX16 7, 5, 3, 4
+ mova m5, [rsp+16*1]
+ mova [rsp+16*1], m6
+ pmulhrsw m3, m5
+ pmulhrsw m3, m4
+ psrlw m4, 1 ; pw_8192
+ paddsw m3, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ pmulhrsw m4, m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ add coeffq, 16
+ dec r3d
+ jg .loop
+ mov coeffq, [rsp+gprsize*2+16*3]
+ add coeffq, 64*8
+ mov r3d, [rsp+gprsize+16*3]
+ xor dstq, dstq
+ mov [rsp+gprsize+16*3], dstq
+ mov dstq, [rsp+16*3]
+ test r3d, r3d
+ jnz .loop
+ RET
+
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r4d, 12 ;1100b
+ mov r5d, 136 ;1000 1000b
+ cmp eobd, 44 ;if (eob > 43)
+ cmovns r4d, r5d ; iteration_count+2
+ cmp eobd, 151 ;if (eob > 150)
+ mov r3d, 34952 ;1000 1000 1000 1000b
+ cmovs r3d, r4d ; iteration_count += 4
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+
+.loop:
+ LOAD_8ROWS coeffq, 32, 1
+ REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova [rsp+16*1], m6
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ mova [rsp+16*1], m5
+ mova [rsp+16*2], m6
+ mova m6, [o(pw_1697x16)]
+ REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
+ pmulhrsw m7, [o(pw_2048)]
+ mova m5, [rsp+16*1]
+ mova [rsp+16*0], m7
+ IDTX16 5, 7, 6
+ mova m7, [rsp+16*2]
+ IDTX16 7, 6, 6
+ mova m6, [o(pw_2048)]
+ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ pxor m7, m7
+ REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+
+.loop_end:
+ add coeffq, 16
+ shr r3d, 2
+ jz .ret
+ test r3d, 2
+ jnz .loop
+ mov r4d, r3d
+ and r4d, 1
+ lea coeffq, [coeffq+r4*8+32*7]
+ mov dstq, [rsp+16*3]
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+ jmp .loop
+
+.ret:
+ RET
+
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_32x32_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 32
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 136
+ mov [rsp+gprsize*1+16*35], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*35], coeffq
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*1, 64*2
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov tx2d, [rsp+gprsize*1+16*35]
+ test tx2d, tx2d
+ jl .fast
+
+.full:
+ LOAD_8ROWS coeffq+64*0, 64*4
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*2, 64*4
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*17, 64*2
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp .pass1_end
+
+.fast:
+ mova m0, [coeffq+256*0]
+ mova m1, [coeffq+256*1]
+ mova m2, [coeffq+256*2]
+ mova m3, [coeffq+256*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [coeffq+128*1]
+ mova m1, [coeffq+128*3]
+ mova m2, [coeffq+128*5]
+ mova m3, [coeffq+128*7]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+
+.pass1_end:
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+64*24, 64
+
+ add coeffq, 16
+ dec r3d
+ jg .pass1_loop
+
+
+.pass2:
+ mov coeffq, [rsp+gprsize*2+16*35]
+ mov r3d, 4
+ lea tx2q, [o(.pass2_end)]
+
+.pass2_loop:
+ mov [rsp+gprsize*3+16*35], r3d
+ lea r3, [dstq+8]
+ mov [rsp+gprsize*2+16*35], r3
+
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*12]
+ mova m2, [coeffq+16*20]
+ mova m3, [coeffq+16*28]
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*13]
+ mova m6, [coeffq+16*21]
+ mova m7, [coeffq+16*29]
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov eobd, [rsp+gprsize*1+16*35]
+ test eobd, eobd
+ jl .fast1
+
+.full1:
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*16]
+ mova m2, [coeffq+16*1 ]
+ mova m3, [coeffq+16*17]
+ mova m4, [coeffq+16*2 ]
+ mova m5, [coeffq+16*18]
+ mova m6, [coeffq+16*3 ]
+ mova m7, [coeffq+16*19]
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova m4, [coeffq+16*10]
+ mova m5, [coeffq+16*26]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*27]
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*6 ]
+ mova m1, [coeffq+16*14]
+ mova m2, [coeffq+16*22]
+ mova m3, [coeffq+16*30]
+ mova m4, [coeffq+16*7 ]
+ mova m5, [coeffq+16*15]
+ mova m6, [coeffq+16*23]
+ mova m7, [coeffq+16*31]
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp tx2q
+
+.fast1:
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*16]
+ mova m2, [coeffq+16*1 ]
+ mova m3, [coeffq+16*17]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp tx2q
+
+.pass2_end:
+ lea r3, [o(.pass2_end1)]
+ jmp m(idct_8x32_internal_8bpc).end
+
+.pass2_end1:
+ lea tx2q, [o(.pass2_end)]
+ add coeffq, 16*32
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov r3d, [rsp+gprsize*3+16*35]
+ dec r3d
+ jg .pass2_loop
+
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ cmp eobd, 136
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*0+16*3], r4
+ mov [rsp+gprsize*1+16*3], r3d
+ mov [rsp+gprsize*2+16*3], r3d
+ mov [rsp+gprsize*3+16*3], coeffq
+
+.loop:
+ LOAD_8ROWS coeffq, 64
+ mova [rsp+16*1], m6
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ pmulhrsw m7, [o(pw_8192)]
+ mova [rsp+16*0], m7
+ mova m7, [o(pw_8192)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova [rsp+16*1], m6
+ mova [rsp+16*2], m5
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+
+ pxor m7, m7
+ REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+
+ add coeffq, 16
+ dec r3d
+ jg .loop
+
+ mov r4d, [rsp+gprsize*2+16*3]
+ dec r4d
+ jle .ret
+
+ mov dstq, [rsp+gprsize*0+16*3]
+ mov coeffq, [rsp+gprsize*3+16*3]
+ mov [rsp+gprsize*2+16*3], r4
+ lea r3, [dstq+8]
+ add coeffq, 64*8
+ mov [rsp+gprsize*0+16*3], r3
+ mov r3d, [rsp+gprsize*1+16*3]
+ mov [rsp+gprsize*3+16*3], coeffq
+ jmp .loop
+
+.ret:
+ RET
+
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_16x64_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r2d, 32
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+
+
+cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 151
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*0, 64*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*1, 64*2
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+
+ add coeffq, 16
+ dec r3d
+ jg .pass1_loop
+
+ mov coeffq, [rsp+gprsize*2+16*67]
+ mov r3d, 2
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.end1)]
+
+.pass2_loop:
+ mov [rsp+gprsize*3+16*67], r3d
+ mov eobd, [rsp+gprsize*1+16*67]
+
+ mova m0, [coeffq+16*4 ] ;in1
+ mova m1, [coeffq+16*12] ;in3
+ mova m2, [coeffq+16*20] ;in5
+ mova m3, [coeffq+16*28] ;in7
+ mova m4, [coeffq+16*5 ] ;in9
+ mova m5, [coeffq+16*13] ;in11
+ mova m6, [coeffq+16*21] ;in13
+ mova m7, [coeffq+16*29] ;in15
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ pxor m4, m4
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+
+ test eobd, eobd
+ jl .fast
+
+.full:
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ mova m0, [coeffq+16*16]
+ mova m1, [coeffq+16*17]
+ mova m2, [coeffq+16*18]
+ mova m3, [coeffq+16*19]
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova m4, [coeffq+16*10]
+ mova m5, [coeffq+16*26]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*27]
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*6 ] ;in17
+ mova m1, [coeffq+16*14] ;in19
+ mova m2, [coeffq+16*22] ;in21
+ mova m3, [coeffq+16*30] ;in23
+ mova m4, [coeffq+16*7 ] ;in25
+ mova m5, [coeffq+16*15] ;in27
+ mova m6, [coeffq+16*23] ;in29
+ mova m7, [coeffq+16*31] ;in31
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call .main
+ jmp .end
+
+.fast:
+ REPX {mova x, m4}, m2, m3, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ mova m0, [coeffq+16*16]
+ mova m1, [coeffq+16*17]
+
+ REPX {mova x, m4}, m2, m3, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+
+ call m(idct_8x32_internal_8bpc).main_veryfast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ call .main_fast
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov r3, r4
+ jmp m(idct_8x32_internal_8bpc).end2
+
+.end1:
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ lea dstq, [dstq+strideq*2]
+ lea r3, [rsp+16*32+gprsize]
+ call .write
+ mov dstq, [rsp+gprsize*2+16*67]
+ mov r3d, [rsp+gprsize*3+16*67]
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.end1)]
+
+ dec r3d
+ jg .pass2_loop
+ ret
+.write:
+ mova [r3+16*0], m7
+ mov r4, -16*32
+ pxor m7, m7
+ sub coeffq, r4
+.zero_loop:
+ mova [coeffq+r4+16*0], m7
+ mova [coeffq+r4+16*1], m7
+ add r4, 16*2
+ jl .zero_loop
+ call .write_main2
+ LOAD_8ROWS r3+16*11, 16
+ call .write_main
+ LOAD_8ROWS r3+16*19, 16
+ call .write_main
+ LOAD_8ROWS r3+16*27, 16
+.write_main:
+ mova [r3+16*0], m7
+.write_main2:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [r3+16*0]
+ mova [r3+16*2], m5
+ mova [r3+16*1], m6
+ mova [r3+16*0], m7
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ ret
+
+
+ALIGN function_align
+cglobal_label .main_fast
+ mova m0, [rsp+gprsize*2+16*35] ;in1
+ pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63
+ pmulhrsw m0, [o(pw_101x8)] ;t32,t33
+ mova m7, [o(pd_2048)]
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*66], m3 ;t63
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a
+ mova [rsp+gprsize*2+16*36], m3 ;t33a
+ mova [rsp+gprsize*2+16*65], m0 ;t62a
+
+ mova m1, [rsp+gprsize*2+16*37] ;in15
+ pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61
+ pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35
+ mova [rsp+gprsize*2+16*38], m1 ;t35
+ mova [rsp+gprsize*2+16*63], m2 ;t60
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m2 ;t34a
+ mova [rsp+gprsize*2+16*64], m1 ;t61a
+
+ mova m0, [rsp+gprsize*2+16*39] ;in9
+ pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59
+ pmulhrsw m0, [o(pw_897x8)] ;t36,t37
+ mova [rsp+gprsize*2+16*39], m0 ;t36
+ mova [rsp+gprsize*2+16*62], m3 ;t59
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a
+ mova [rsp+gprsize*2+16*40], m3 ;t37a
+ mova [rsp+gprsize*2+16*61], m0 ;t58a
+
+ mova m1, [rsp+gprsize*2+16*41] ;in7
+ pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57
+ pmulhrsw m1, [o(pw_m700x8)] ;t38,t39
+ mova [rsp+gprsize*2+16*42], m1 ;t39
+ mova [rsp+gprsize*2+16*59], m2 ;t56
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m2 ;t38a
+ mova [rsp+gprsize*2+16*60], m1 ;t57a
+
+ mova m0, [rsp+gprsize*2+16*43] ;in5
+ pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55
+ pmulhrsw m0, [o(pw_501x8)] ;t40,t41
+ mova [rsp+gprsize*2+16*43], m0 ;t40
+ mova [rsp+gprsize*2+16*58], m3 ;t55
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a
+ mova [rsp+gprsize*2+16*44], m3 ;t41a
+ mova [rsp+gprsize*2+16*57], m0 ;t54a
+
+ mova m1, [rsp+gprsize*2+16*45] ;in11
+ pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53
+ pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43
+ mova [rsp+gprsize*2+16*46], m1 ;t43
+ mova [rsp+gprsize*2+16*55], m2 ;t52
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m2 ;t42a
+ mova [rsp+gprsize*2+16*56], m1 ;t53a
+
+ mova m0, [rsp+gprsize*2+16*47] ;in13
+ pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51
+ pmulhrsw m0, [o(pw_1285x8)] ;t44,t45
+ mova m6, m0
+ mova [rsp+gprsize*2+16*54], m3 ;t51
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a
+ mova [rsp+gprsize*2+16*48], m3 ;t45a
+ mova [rsp+gprsize*2+16*53], m0 ;t50a
+
+ mova m0, [rsp+gprsize*2+16*49] ;in3
+ pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49
+ pmulhrsw m0, [o(pw_m301x8)] ;t46,t47
+ mova m4, m3
+ mova m5, m0
+
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main
+ mova m0, [rsp+gprsize*2+16*35] ;in1
+ mova m1, [rsp+gprsize*2+16*65] ;in31
+ pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a
+ pmulhrsw m0, [o(pw_101x8)] ;t32a
+ pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a
+ pmulhrsw m1, [o(pw_m2824x8)] ;t33a
+ mova m7, [o(pd_2048)]
+ psubsw m4, m0, m1 ;t33
+ paddsw m0, m1 ;t32
+ psubsw m5, m3, m2 ;t62
+ paddsw m3, m2 ;t63
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*36], m5 ;t33a
+ mova [rsp+gprsize*2+16*65], m4 ;t62a
+ mova [rsp+gprsize*2+16*66], m3 ;t63
+
+ mova m0, [rsp+gprsize*2+16*63] ;in17
+ mova m1, [rsp+gprsize*2+16*37] ;in15
+ pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a
+ pmulhrsw m0, [o(pw_1660x8)] ;t34a
+ pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a
+ pmulhrsw m1, [o(pw_m1474x8)] ;t35a
+ psubsw m4, m1, m0 ;t34
+ paddsw m0, m1 ;t35
+ psubsw m5, m2, m3 ;t61
+ paddsw m3, m2 ;t60
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m5 ;t34a
+ mova [rsp+gprsize*2+16*38], m0 ;t35
+ mova [rsp+gprsize*2+16*63], m3 ;t60
+ mova [rsp+gprsize*2+16*64], m4 ;t61a
+
+ mova m0, [rsp+gprsize*2+16*39] ;in9
+ mova m1, [rsp+gprsize*2+16*61] ;in23
+ pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a
+ pmulhrsw m0, [o(pw_897x8)] ;t36a
+ pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a
+ pmulhrsw m1, [o(pw_m2191x8)] ;t37a
+ psubsw m4, m0, m1 ;t37
+ paddsw m0, m1 ;t36
+ psubsw m5, m3, m2 ;t58
+ paddsw m3, m2 ;t59
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a
+ mova [rsp+gprsize*2+16*39], m0 ;t36
+ mova [rsp+gprsize*2+16*40], m5 ;t37a
+ mova [rsp+gprsize*2+16*61], m4 ;t58a
+ mova [rsp+gprsize*2+16*62], m3 ;t59
+
+ mova m0, [rsp+gprsize*2+16*59] ;in25
+ mova m1, [rsp+gprsize*2+16*41] ;in7
+ pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a
+ pmulhrsw m0, [o(pw_2359x8)] ;t38a
+ pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a
+ pmulhrsw m1, [o(pw_m700x8)] ;t39a
+ psubsw m4, m1, m0 ;t38
+ paddsw m0, m1 ;t39
+ psubsw m5, m2, m3 ;t57
+ paddsw m3, m2 ;t56
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m5 ;t38a
+ mova [rsp+gprsize*2+16*42], m0 ;t39
+ mova [rsp+gprsize*2+16*59], m3 ;t56
+ mova [rsp+gprsize*2+16*60], m4 ;t57a
+
+ mova m0, [rsp+gprsize*2+16*43] ;in5
+ mova m1, [rsp+gprsize*2+16*57] ;in27
+ pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a
+ pmulhrsw m0, [o(pw_501x8)] ;t40a
+ pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a
+ pmulhrsw m1, [o(pw_m2520x8)] ;t41a
+ psubsw m4, m0, m1 ;t41
+ paddsw m0, m1 ;t40
+ psubsw m5, m3, m2 ;t54
+ paddsw m3, m2 ;t55
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a
+ mova [rsp+gprsize*2+16*43], m0 ;t40
+ mova [rsp+gprsize*2+16*44], m5 ;t41a
+ mova [rsp+gprsize*2+16*57], m4 ;t54a
+ mova [rsp+gprsize*2+16*58], m3 ;t55
+
+ mova m0, [rsp+gprsize*2+16*55] ;in21
+ mova m1, [rsp+gprsize*2+16*45] ;in11
+ pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a
+ pmulhrsw m0, [o(pw_2019x8)] ;t42a
+ pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a
+ pmulhrsw m1, [o(pw_m1092x8)] ;t43a
+ psubsw m4, m1, m0 ;t42
+ paddsw m0, m1 ;t43
+ psubsw m5, m2, m3 ;t53
+ paddsw m3, m2 ;t52
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*46], m0 ;t43
+ mova [rsp+gprsize*2+16*55], m3 ;t52
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+
+ mova m0, [rsp+gprsize*2+16*47] ;in13
+ mova m1, [rsp+gprsize*2+16*53] ;in19
+ pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a
+ pmulhrsw m0, [o(pw_1285x8)] ;t44a
+ pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a
+ pmulhrsw m1, [o(pw_m1842x8)] ;t45a
+ psubsw m4, m0, m1 ;t45
+ paddsw m0, m1 ;t44
+ psubsw m5, m3, m2 ;t50
+ paddsw m3, m2 ;t51
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a
+ mova m6, m0
+ mova [rsp+gprsize*2+16*48], m5 ;t45a
+ mova [rsp+gprsize*2+16*53], m4 ;t50a
+ mova [rsp+gprsize*2+16*54], m3 ;t51
+
+ mova m0, [rsp+gprsize*2+16*51] ;in29
+ mova m1, [rsp+gprsize*2+16*49] ;in3
+ pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a
+ pmulhrsw m0, [o(pw_2675x8)] ;t46a
+ pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a
+ pmulhrsw m1, [o(pw_m301x8)] ;t47a
+ psubsw m5, m1, m0 ;t46
+ paddsw m0, m1 ;t47
+ psubsw m4, m2, m3 ;t49
+ paddsw m3, m2 ;t48
+
+ALIGN function_align
+.main2:
+ ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a
+ mova m1, [rsp+gprsize*2+16*54] ;t51
+ psubsw m2, m0, m6 ;t44a
+ paddsw m0, m6 ;t47a
+ psubsw m6, m3, m1 ;t51a
+ paddsw m3, m1 ;t48a
+ mova [rsp+gprsize*2+16*50], m0 ;t47a
+ mova [rsp+gprsize*2+16*51], m3 ;t48a
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51
+ mova [rsp+gprsize*2+16*47], m6 ;t44
+ mova [rsp+gprsize*2+16*54], m2 ;t51
+
+ mova m0, [rsp+gprsize*2+16*48] ;t45a
+ mova m3, [rsp+gprsize*2+16*53] ;t50a
+ psubsw m2, m4, m0 ;t45
+ paddsw m4, m0 ;t46
+ psubsw m6, m5, m3 ;t50
+ paddsw m5, m3 ;t49
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a
+ mova [rsp+gprsize*2+16*48], m6 ;t45a
+ mova [rsp+gprsize*2+16*49], m4 ;t46
+ mova [rsp+gprsize*2+16*52], m5 ;t49
+ mova [rsp+gprsize*2+16*53], m2 ;t50a
+
+ mova m0, [rsp+gprsize*2+16*43] ;t40
+ mova m2, [rsp+gprsize*2+16*46] ;t43
+ mova m3, [rsp+gprsize*2+16*55] ;t52
+ mova m1, [rsp+gprsize*2+16*58] ;t55
+ psubsw m4, m0, m2 ;t43a
+ paddsw m0, m2 ;t40a
+ psubsw m5, m1, m3 ;t52a
+ paddsw m1, m3 ;t55a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52
+ mova [rsp+gprsize*2+16*43], m0 ;t40a
+ mova [rsp+gprsize*2+16*46], m5 ;t43
+ mova [rsp+gprsize*2+16*55], m4 ;t52
+ mova [rsp+gprsize*2+16*58], m1 ;t55a
+
+ mova m0, [rsp+gprsize*2+16*44] ;t41a
+ mova m2, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*57] ;t54a
+ psubsw m4, m0, m2 ;t42
+ paddsw m0, m2 ;t41
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t54
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a
+ mova [rsp+gprsize*2+16*44], m0 ;t41
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+ mova [rsp+gprsize*2+16*57], m1 ;t54
+
+ mova m0, [rsp+gprsize*2+16*41] ;t38a
+ mova m2, [rsp+gprsize*2+16*40] ;t37a
+ mova m3, [rsp+gprsize*2+16*61] ;t58a
+ mova m1, [rsp+gprsize*2+16*60] ;t57a
+ psubsw m4, m0, m2 ;t37
+ paddsw m0, m2 ;t38
+ psubsw m5, m1, m3 ;t58
+ paddsw m1, m3 ;t57
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a
+ mova [rsp+gprsize*2+16*41], m0 ;t38
+ mova [rsp+gprsize*2+16*40], m5 ;t37a
+ mova [rsp+gprsize*2+16*61], m4 ;t58a
+ mova [rsp+gprsize*2+16*60], m1 ;t57
+
+ mova m0, [rsp+gprsize*2+16*42] ;t39
+ mova m2, [rsp+gprsize*2+16*39] ;t36
+ mova m3, [rsp+gprsize*2+16*62] ;t59
+ mova m1, [rsp+gprsize*2+16*59] ;t56
+ psubsw m4, m0, m2 ;t36a
+ paddsw m0, m2 ;t39a
+ psubsw m5, m1, m3 ;t59a
+ paddsw m1, m3 ;t56a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59
+ mova [rsp+gprsize*2+16*42], m0 ;t39a
+ mova [rsp+gprsize*2+16*39], m5 ;t36
+ mova [rsp+gprsize*2+16*62], m4 ;t59
+ mova [rsp+gprsize*2+16*59], m1 ;t56a
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32
+ mova m2, [rsp+gprsize*2+16*38] ;t35
+ mova m3, [rsp+gprsize*2+16*63] ;t60
+ mova m1, [rsp+gprsize*2+16*66] ;t63
+ psubsw m4, m0, m2 ;t35a
+ paddsw m0, m2 ;t32a
+ psubsw m5, m1, m3 ;t60a
+ paddsw m1, m3 ;t63a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60
+ mova [rsp+gprsize*2+16*35], m0 ;t32a
+ mova [rsp+gprsize*2+16*38], m5 ;t35
+ mova [rsp+gprsize*2+16*63], m4 ;t60
+ mova [rsp+gprsize*2+16*66], m1 ;t63a
+
+ mova m0, [rsp+gprsize*2+16*36] ;t33a
+ mova m2, [rsp+gprsize*2+16*37] ;t34a
+ mova m3, [rsp+gprsize*2+16*64] ;t61a
+ mova m1, [rsp+gprsize*2+16*65] ;t62a
+ psubsw m4, m0, m2 ;t34
+ paddsw m0, m2 ;t33
+ psubsw m5, m1, m3 ;t61
+ paddsw m1, m3 ;t62
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a
+
+ mova m2, [rsp+gprsize*2+16*41] ;t38
+ mova m3, [rsp+gprsize*2+16*60] ;t57
+ psubsw m6, m0, m2 ;t38a
+ paddsw m0, m2 ;t33a
+ psubsw m2, m1, m3 ;t57a
+ paddsw m1, m3 ;t62a
+ mova [rsp+gprsize*2+16*36], m0 ;t33a
+ mova [rsp+gprsize*2+16*65], m1 ;t62a
+ ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57
+ mova [rsp+gprsize*2+16*41], m2 ;t38
+ mova [rsp+gprsize*2+16*60], m6 ;t57
+
+ mova m2, [rsp+gprsize*2+16*40] ;t37
+ mova m3, [rsp+gprsize*2+16*61] ;t58
+ psubsw m0, m5, m2 ;t37
+ paddsw m5, m2 ;t34
+ psubsw m1, m4, m3 ;t58
+ paddsw m4, m3 ;t61
+ ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a
+ mova [rsp+gprsize*2+16*37], m5 ;t34
+ mova [rsp+gprsize*2+16*64], m4 ;t61
+ mova [rsp+gprsize*2+16*40], m1 ;t37a
+ mova [rsp+gprsize*2+16*61], m0 ;t58a
+
+ mova m0, [rsp+gprsize*2+16*38] ;t35
+ mova m2, [rsp+gprsize*2+16*39] ;t36
+ mova m3, [rsp+gprsize*2+16*62] ;t59
+ mova m1, [rsp+gprsize*2+16*63] ;t60
+ psubsw m4, m0, m2 ;t36a
+ paddsw m0, m2 ;t35a
+ psubsw m5, m1, m3 ;t59a
+ paddsw m1, m3 ;t60a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59
+ mova [rsp+gprsize*2+16*38], m0 ;t35a
+ mova [rsp+gprsize*2+16*39], m5 ;t36
+ mova [rsp+gprsize*2+16*62], m4 ;t59
+ mova [rsp+gprsize*2+16*63], m1 ;t60a
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32a
+ mova m2, [rsp+gprsize*2+16*42] ;t39a
+ mova m3, [rsp+gprsize*2+16*59] ;t56a
+ mova m1, [rsp+gprsize*2+16*66] ;t63a
+ psubsw m4, m0, m2 ;t39
+ paddsw m0, m2 ;t32
+ psubsw m5, m1, m3 ;t56
+ paddsw m1, m3 ;t63
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*42], m5 ;t39a
+ mova [rsp+gprsize*2+16*59], m4 ;t56a
+ mova [rsp+gprsize*2+16*66], m1 ;t63
+
+ mova m0, [rsp+gprsize*2+16*50] ;t47a
+ mova m2, [rsp+gprsize*2+16*43] ;t40a
+ mova m3, [rsp+gprsize*2+16*58] ;t55a
+ mova m1, [rsp+gprsize*2+16*51] ;t48a
+ psubsw m4, m0, m2 ;t40
+ paddsw m0, m2 ;t47
+ psubsw m5, m1, m3 ;t55
+ paddsw m1, m3 ;t48
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a
+ mova [rsp+gprsize*2+16*50], m0 ;t47
+ mova [rsp+gprsize*2+16*43], m5 ;t40a
+ mova [rsp+gprsize*2+16*58], m4 ;t55a
+ mova [rsp+gprsize*2+16*51], m1 ;t48
+
+ mova m0, [rsp+gprsize*2+16*49] ;t46
+ mova m2, [rsp+gprsize*2+16*44] ;t41
+ mova m3, [rsp+gprsize*2+16*57] ;t54
+ mova m1, [rsp+gprsize*2+16*52] ;t49
+ psubsw m4, m0, m2 ;t41a
+ paddsw m0, m2 ;t46a
+ psubsw m5, m1, m3 ;t54a
+ paddsw m1, m3 ;t49a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54
+ mova [rsp+gprsize*2+16*49], m0 ;t46a
+ mova [rsp+gprsize*2+16*44], m5 ;t41
+ mova [rsp+gprsize*2+16*57], m4 ;t54
+ mova [rsp+gprsize*2+16*52], m1 ;t49a
+
+ mova m0, [rsp+gprsize*2+16*48] ;t45a
+ mova m2, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*53] ;t50a
+ psubsw m4, m0, m2 ;t42
+ paddsw m0, m2 ;t45
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t50
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a
+ mova [rsp+gprsize*2+16*48], m0 ;t45
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+ mova [rsp+gprsize*2+16*53], m1 ;t50
+
+ mova m0, [rsp+gprsize*2+16*47] ;t44
+ mova m2, [rsp+gprsize*2+16*46] ;t43
+ mova m3, [rsp+gprsize*2+16*55] ;t52
+ mova m1, [rsp+gprsize*2+16*54] ;t51
+ psubsw m4, m0, m2 ;t43a
+ paddsw m0, m2 ;t44a
+ psubsw m5, m1, m3 ;t52a
+ paddsw m1, m3 ;t51a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52
+
+ mova m2, [rsp+gprsize*2+16*38] ;t35a
+ mova m3, [rsp+gprsize*2+16*31] ;tmp[28]
+ psubsw m6, m2, m0 ;t44
+ paddsw m2, m0 ;t35
+ psubsw m0, m3, m2 ;out35
+ paddsw m2, m3 ;out28
+ mova m3, [rsp+gprsize*2+16*63] ;t60a
+ mova [rsp+gprsize*2+16*38], m0 ;out35
+ mova [rsp+gprsize*2+16*31], m2 ;out28
+ psubsw m0, m3, m1 ;t51
+ paddsw m3, m1 ;t60
+ ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a
+ mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3]
+ psubsw m1, m2, m3 ;out60
+ paddsw m2, m3 ;out3
+ mova m3, [rsp+gprsize*2+16*22] ;tmp[19]
+ mova [rsp+gprsize*2+16*63], m1 ;out60
+ mova [rsp+gprsize*2+16*6 ], m2 ;out3
+ psubsw m1, m3, m0 ;out44
+ paddsw m3, m0 ;out19
+ mova m2, [rsp+gprsize*2+16*15] ;tmp[12]
+
+ mova m0, [rsp+gprsize*2+16*39] ;t36
+ mova [rsp+gprsize*2+16*47], m1 ;out44
+ mova [rsp+gprsize*2+16*22], m3 ;out19
+ mova m1, [rsp+gprsize*2+16*62] ;t59
+ psubsw m3, m2, m6 ;out51
+ paddsw m2, m6 ;out12
+ mova [rsp+gprsize*2+16*54], m3 ;out51
+ mova [rsp+gprsize*2+16*15], m2 ;out12
+ psubsw m2, m0, m5 ;t43a
+ paddsw m0, m5 ;t36a
+ mova m5, [rsp+gprsize*2+16*30] ;tmp[27]
+ psubsw m3, m1, m4 ;t52a
+ paddsw m1, m4 ;t59a
+ ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52
+ mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ]
+ psubsw m6, m5, m0 ;out36
+ paddsw m5, m0 ;out27
+ psubsw m0, m4, m1 ;out59
+ paddsw m4, m1 ;out4
+ mova [rsp+gprsize*2+16*39], m6 ;out36
+ mova [rsp+gprsize*2+16*30], m5 ;out27
+ mova [rsp+gprsize*2+16*62], m0 ;out59
+ mova [rsp+gprsize*2+16*7 ], m4 ;out4
+ mova m0, [rsp+gprsize*2+16*23] ;tmp[20]
+ mova m5, [rsp+gprsize*2+16*14] ;tmp[11]
+ psubsw m4, m0, m3 ;out43
+ paddsw m0, m3 ;out20
+ psubsw m6, m5, m2 ;out52
+ paddsw m5, m2 ;out11
+ mova [rsp+gprsize*2+16*46], m4 ;out43
+ mova [rsp+gprsize*2+16*23], m0 ;out20
+ mova [rsp+gprsize*2+16*55], m6 ;out52
+ mova [rsp+gprsize*2+16*14], m5 ;out11
+
+ mova m0, [rsp+gprsize*2+16*40] ;t37a
+ mova m5, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*61] ;t58a
+ mova m2, [rsp+gprsize*2+16*29] ;tmp[26]
+ psubsw m4, m0, m5 ;t42
+ paddsw m0, m5 ;t37
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t58
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t42a, t53a
+ mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ]
+ psubsw m6, m2, m0 ;out37
+ paddsw m2, m0 ;out26
+ psubsw m0, m3, m1 ;out58
+ paddsw m3, m1 ;out5
+ mova [rsp+gprsize*2+16*40], m6 ;out37
+ mova [rsp+gprsize*2+16*29], m2 ;out26
+ mova [rsp+gprsize*2+16*61], m0 ;out58
+ mova [rsp+gprsize*2+16*8 ], m3 ;out5
+ mova m0, [rsp+gprsize*2+16*24] ;tmp[21]
+ mova m1, [rsp+gprsize*2+16*13] ;tmp[10]
+ psubsw m2, m0, m5 ;out42
+ paddsw m0, m5 ;out21
+ psubsw m3, m1, m4 ;out53
+ paddsw m1, m4 ;out10
+ mova [rsp+gprsize*2+16*45], m2 ;out42
+ mova [rsp+gprsize*2+16*24], m0 ;out21
+ mova [rsp+gprsize*2+16*56], m3 ;out53
+ mova [rsp+gprsize*2+16*13], m1 ;out10
+
+ mova m0, [rsp+gprsize*2+16*41] ;t38
+ mova m5, [rsp+gprsize*2+16*44] ;t41
+ mova m3, [rsp+gprsize*2+16*57] ;t54
+ mova m1, [rsp+gprsize*2+16*60] ;t57
+ mova m2, [rsp+gprsize*2+16*28] ;tmp[25]
+ psubsw m4, m0, m5 ;t41a
+ paddsw m0, m5 ;t38a
+ psubsw m5, m1, m3 ;t54a
+ paddsw m1, m3 ;t57a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a
+ mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ]
+ psubsw m6, m2, m0 ;out38
+ paddsw m2, m0 ;out25
+ psubsw m0, m3, m1 ;out57
+ paddsw m3, m1 ;out6
+ mova [rsp+gprsize*2+16*41], m6 ;out38
+ mova [rsp+gprsize*2+16*28], m2 ;out25
+ mova [rsp+gprsize*2+16*60], m0 ;out57
+ mova [rsp+gprsize*2+16*9 ], m3 ;out6
+ mova m0, [rsp+gprsize*2+16*25] ;tmp[22]
+ mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ]
+ psubsw m2, m0, m5 ;out41
+ paddsw m0, m5 ;out22
+ psubsw m3, m1, m4 ;out54
+ paddsw m1, m4 ;out9
+ mova [rsp+gprsize*2+16*44], m2 ;out41
+ mova [rsp+gprsize*2+16*25], m0 ;out22
+ mova [rsp+gprsize*2+16*57], m3 ;out54
+ mova [rsp+gprsize*2+16*12], m1 ;out9
+
+ mova m0, [rsp+gprsize*2+16*42] ;t39a
+ mova m5, [rsp+gprsize*2+16*43] ;t40a
+ mova m3, [rsp+gprsize*2+16*58] ;t55a
+ mova m1, [rsp+gprsize*2+16*59] ;t56a
+ mova m2, [rsp+gprsize*2+16*27] ;tmp[24]
+ psubsw m4, m0, m5 ;t40
+ paddsw m0, m5 ;t39
+ psubsw m5, m1, m3 ;t55
+ paddsw m1, m3 ;t56
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a
+ mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ]
+ psubsw m6, m2, m0 ;out39
+ paddsw m2, m0 ;out24
+ psubsw m0, m3, m1 ;out56
+ paddsw m3, m1 ;out7
+ mova [rsp+gprsize*2+16*42], m6 ;out39
+ mova [rsp+gprsize*2+16*27], m2 ;out24
+ mova [rsp+gprsize*2+16*59], m0 ;out56
+ mova [rsp+gprsize*2+16*10], m3 ;out7
+ mova m0, [rsp+gprsize*2+16*26] ;tmp[23]
+ mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ]
+ psubsw m2, m0, m5 ;out40
+ paddsw m0, m5 ;out23
+ psubsw m3, m1, m4 ;out55
+ paddsw m1, m4 ;out8
+ mova [rsp+gprsize*2+16*43], m2 ;out40
+ mova [rsp+gprsize*2+16*26], m0 ;out23
+ mova [rsp+gprsize*2+16*58], m3 ;out55
+ mova [rsp+gprsize*2+16*11], m1 ;out8
+
+ mova m0, [rsp+gprsize*2+16*37] ;t34
+ mova m5, [rsp+gprsize*2+16*48] ;t45
+ mova m3, [rsp+gprsize*2+16*53] ;t50
+ mova m1, [rsp+gprsize*2+16*64] ;t61
+ mova m2, [rsp+gprsize*2+16*32] ;tmp[29]
+ psubsw m4, m0, m5 ;t45a
+ paddsw m0, m5 ;t34a
+ psubsw m5, m1, m3 ;t50a
+ paddsw m1, m3 ;t61a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50
+ mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ]
+ psubsw m6, m2, m0 ;out34
+ paddsw m2, m0 ;out29
+ psubsw m0, m3, m1 ;out61
+ paddsw m3, m1 ;out2
+ mova [rsp+gprsize*2+16*37], m6 ;out34
+ mova [rsp+gprsize*2+16*32], m2 ;out29
+ mova [rsp+gprsize*2+16*64], m0 ;out61
+ mova [rsp+gprsize*2+16*5 ], m3 ;out2
+ mova m0, [rsp+gprsize*2+16*21] ;tmp[18]
+ mova m1, [rsp+gprsize*2+16*16] ;tmp[13]
+ psubsw m2, m0, m5 ;out45
+ paddsw m0, m5 ;out18
+ psubsw m3, m1, m4 ;out50
+ paddsw m1, m4 ;out13
+ mova [rsp+gprsize*2+16*48], m2 ;out45
+ mova [rsp+gprsize*2+16*21], m0 ;out18
+ mova [rsp+gprsize*2+16*53], m3 ;out50
+ mova [rsp+gprsize*2+16*16], m1 ;out13
+
+ mova m0, [rsp+gprsize*2+16*36] ;t33a
+ mova m5, [rsp+gprsize*2+16*49] ;t46a
+ mova m3, [rsp+gprsize*2+16*52] ;t49a
+ mova m1, [rsp+gprsize*2+16*65] ;t62a
+ mova m2, [rsp+gprsize*2+16*33] ;tmp[30]
+ psubsw m4, m0, m5 ;t46
+ paddsw m0, m5 ;t33
+ psubsw m5, m1, m3 ;t49
+ paddsw m1, m3 ;t62
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t46a, t49a
+ mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ]
+ psubsw m6, m2, m0 ;out33
+ paddsw m2, m0 ;out30
+ psubsw m0, m3, m1 ;out62
+ paddsw m3, m1 ;out1
+ mova [rsp+gprsize*2+16*36], m6 ;out33
+ mova [rsp+gprsize*2+16*33], m2 ;out30
+ mova [rsp+gprsize*2+16*65], m0 ;out62
+ mova [rsp+gprsize*2+16*4 ], m3 ;out1
+ mova m0, [rsp+gprsize*2+16*20] ;tmp[17]
+ mova m1, [rsp+gprsize*2+16*17] ;tmp[14]
+ psubsw m2, m0, m5 ;out46
+ paddsw m0, m5 ;out17
+ psubsw m3, m1, m4 ;out49
+ paddsw m1, m4 ;out14
+ mova [rsp+gprsize*2+16*49], m2 ;out46
+ mova [rsp+gprsize*2+16*20], m0 ;out17
+ mova [rsp+gprsize*2+16*52], m3 ;out49
+ mova [rsp+gprsize*2+16*17], m1 ;out14
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32
+ mova m5, [rsp+gprsize*2+16*50] ;t47
+ mova m3, [rsp+gprsize*2+16*51] ;t48
+ mova m1, [rsp+gprsize*2+16*66] ;t63
+ mova m2, [rsp+gprsize*2+16*34] ;tmp[31]
+ psubsw m4, m0, m5 ;t47a
+ paddsw m0, m5 ;t32a
+ psubsw m5, m1, m3 ;t48a
+ paddsw m1, m3 ;t63a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48
+ mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ]
+ psubsw m6, m2, m0 ;out32
+ paddsw m2, m0 ;out31
+ psubsw m0, m3, m1 ;out63
+ paddsw m3, m1 ;out0
+ mova [rsp+gprsize*2+16*35], m6 ;out32
+ mova [rsp+gprsize*2+16*34], m2 ;out31
+ mova [rsp+gprsize*2+16*66], m0 ;out63
+ mova [rsp+gprsize*2+16*3 ], m3 ;out0
+ mova m0, [rsp+gprsize*2+16*19] ;tmp[16]
+ mova m1, [rsp+gprsize*2+16*18] ;tmp[15]
+ psubsw m2, m0, m5 ;out47
+ paddsw m0, m5 ;out16
+ psubsw m3, m1, m4 ;out48
+ paddsw m1, m4 ;out15
+ mova [rsp+gprsize*2+16*50], m2 ;out47
+ mova [rsp+gprsize*2+16*19], m0 ;out16
+ mova [rsp+gprsize*2+16*51], m3 ;out48
+ mova [rsp+gprsize*2+16*18], m1 ;out15
+ ret
+
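The pw_<c>x8 constants used throughout .main_fast/.main above are AV1's 12-bit cosine terms pre-multiplied by 8: pmulhrsw computes (a*b + (1 << 14)) >> 15 per word, so with b = c*8 it yields (a*c + 2048) >> 12 in a single instruction, and ITX_MULSUB_2W then combines pairs of such products into a butterfly rotation rounded with the pd_2048 value kept in m7. A minimal scalar model of the single-multiply building block (illustrative only, not part of the patch; assumes arithmetic right shift of negative values):

    #include <stdint.h>

    /* Scalar model of pmulhrsw(a, c*8): multiply by a 12-bit fixed-point
     * cosine c and round with AV1's usual (+2048) >> 12. */
    static int16_t mul_hrs_x8(int16_t a, int16_t c)
    {
        return (int16_t)(((int32_t)a * c + 2048) >> 12);
    }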
+
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_64x16_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 16
+ lea tx2q, [o(.end)]
+
+.body:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ pxor m7, m7
+
+.loop:
+ mova m1, [dstq+16*0]
+ mova m3, [dstq+16*1]
+ mova m5, [dstq+16*2]
+ mova m6, [dstq+16*3]
+ punpckhbw m2, m1, m7
+ punpcklbw m1, m7
+ punpckhbw m4, m3, m7
+ punpcklbw m3, m7
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ punpckhbw m2, m5, m7
+ punpcklbw m5, m7
+ punpckhbw m4, m6, m7
+ punpcklbw m6, m7
+ paddw m2, m0
+ paddw m5, m0
+ paddw m4, m0
+ paddw m6, m0
+ packuswb m5, m2
+ packuswb m6, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m3
+ mova [dstq+16*2], m5
+ mova [dstq+16*3], m6
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ jmp tx2q
+
+.end:
+ RET
+
+
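The .dconly/.body path above reduces the DC-only case to a single broadcast add: the DC coefficient is scaled and rounded with pmulhrsw, replicated across all word lanes, and .loop then widens each destination row to words, adds the term, and repacks with unsigned saturation (packuswb). A scalar sketch of that inner loop, assuming 8bpc pixels and an already-rounded DC value dc (function name and signature are illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of the .loop block: add a broadcast DC term to 64 pixels
     * per row and clamp to [0, 255], as paddw + packuswb do. */
    static void dc_only_add_w64(uint8_t *dst, ptrdiff_t stride, int dc, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 64; x++) {
                const int px = dst[x] + dc;
                dst[x] = px < 0 ? 0 : px > 255 ? 255 : (uint8_t)px;
            }
            dst += stride;
        }
    }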
+%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2
+
+%if %3
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [%1+%2*0]
+ pmulhrsw m1, m3, [%1+%2*1]
+ pmulhrsw m2, m3, [%1+%2*2]
+ pmulhrsw m3, [%1+%2*3]
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+%endif
+%endmacro
+
+%macro LOAD_4ROWS_H 2 ;src, stride
+ mova m4, [%1+%2*0]
+ mova m5, [%1+%2*1]
+ mova m6, [%1+%2*2]
+ mova m7, [%1+%2*3]
+%endmacro
+
+cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r3d, 2
+ mov [rsp+gprsize*2+16*67], dstq
+ lea dstq, [rsp+gprsize+16*68]
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+32*0, 32*8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+32*4, 32*8
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+32*2, 32*4
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+32*1, 32*2
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+32*17, 32*2
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+32*0, 32
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+32*8, 32
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+32*16, 32
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+32*24, 32
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS dstq+32*0, 32
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end5:
+ SAVE_8ROWS dstq+32*8, 32
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end6:
+ SAVE_8ROWS dstq+32*16, 32
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end7:
+ SAVE_8ROWS dstq+32*24, 32
+
+ add coeffq, 16
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov dstq, [rsp+gprsize*2+16*67]
+ sub coeffq, 32
+ mov r3d, 4
+
+.pass2_loop:
+ mov [rsp+gprsize*1+16*67], r3d
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea tx2q, [o(.end)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop
+
+ mov r3d, 4
+ lea coeffq, [rsp+gprsize+16*68]
+.pass2_loop2:
+ mov [rsp+gprsize*1+16*67], r3d
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea tx2q, [o(.end2)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end3)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end3:
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop2
+ ret
+
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_32x64_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r3d, 64
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 136
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*1, 64*2, 1
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov tx2d, [rsp+gprsize*1+16*67]
+ test tx2d, tx2d
+ jl .fast
+
+.full:
+ LOAD_8ROWS coeffq+64*0, 64*4, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*2, 64*4, 1
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*17, 64*2, 1
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp .pass1_end
+
+.fast:
+ LOAD_4ROWS coeffq, 256, 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+128*1, 256, 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+
+.pass1_end:
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+64*24, 64
+
+ add coeffq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov coeffq, [rsp+gprsize*2+16*67]
+ mov r3d, 4
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(m(idct_16x64_internal_8bpc).end1)]
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_64x32_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd
+ mov r3d, 32
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
+
+
+cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 136
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq
+ mov [rsp+gprsize*3+16*67], dstq
+ lea dstq, [rsp+gprsize+16*69]
+ mov [rsp+gprsize*4+16*67], dstq
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+64*0, 64*8, 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+64*4, 64*8, 1
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*2, 64*4, 1
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+64*1, 64*2, 1
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+64*17, 64*2, 1
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*24, 64
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS dstq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end5:
+ SAVE_8ROWS dstq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end6:
+ SAVE_8ROWS dstq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end7:
+ SAVE_8ROWS dstq+64*24, 64
+
+ add coeffq, 16
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov coeffq, [rsp+gprsize*4+16*67]
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov eobd, [rsp+gprsize*1+16*67]
+ lea dstq, [dstq+32]
+ mov [rsp+gprsize*1+16*35], eobd
+ lea tx2q, [o(.pass2_end)]
+ mov r3d, 4
+ jmp m(idct_32x32_internal_8bpc).pass2_loop
+
+.pass2_end:
+ mova [rsp+gprsize+16*0], m7
+ lea r3, [o(.pass2_end1)]
+ jmp m(idct_8x32_internal_8bpc).end2
+
+.pass2_end1:
+ lea tx2q, [o(.pass2_end)]
+ add coeffq, 16*32
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov r3d, [rsp+gprsize*3+16*35]
+ dec r3d
+ jg m(idct_32x32_internal_8bpc).pass2_loop
+
+.pass2_end2:
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov coeffq, [rsp+gprsize*2+16*67]
+ lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
+ mov r3d, 4
+ jmp m(idct_32x32_internal_8bpc).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_64x64_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 64
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
+
+cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r5d, 4
+ mov r4d, 2
+ sub eobd, 136
+ cmovns r4d, r5d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, r4d
+ mov [rsp+gprsize*4+16*67], coeffq
+ mov [rsp+gprsize*3+16*67], dstq
+ lea dstq, [rsp+gprsize+16*69]
+ mov [rsp+gprsize*2+16*67], dstq
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+64*0, 64*8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+64*4, 64*8
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*2, 64*4
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+64*1, 64*2
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+64*17, 64*2
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*24, 64
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS dstq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end5:
+ SAVE_8ROWS dstq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end6:
+ SAVE_8ROWS dstq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end7:
+ SAVE_8ROWS dstq+64*24, 64
+
+ add coeffq, 16
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov coeffq, [rsp+gprsize*2+16*67]
+ lea dstq, [dstq+32]
+ mov r3d, 4
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.pass2_end)]
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
+
+.pass2_end:
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ lea dstq, [dstq+strideq*2]
+ lea r3, [rsp+16*32+gprsize]
+ mova [rsp+gprsize+16*0], m7
+ call m(idct_16x64_internal_8bpc).write
+ mov dstq, [rsp+gprsize*2+16*67]
+ mov r3d, [rsp+gprsize*3+16*67]
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.pass2_end)]
+
+ dec r3d
+ jg m(idct_16x64_internal_8bpc).pass2_loop
+
+.pass2_end2:
+ mov coeffq, [rsp+gprsize*4+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ mov r3d, 4
+ sub dstq, 72
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(m(idct_16x64_internal_8bpc).end1)]
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
diff --git a/third_party/dav1d/src/x86/loopfilter.h b/third_party/dav1d/src/x86/loopfilter.h
new file mode 100644
index 0000000000..33c842a9ce
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+#define decl_loopfilter_sb_fns(ext) \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext))
+
+decl_loopfilter_sb_fns(ssse3);
+decl_loopfilter_sb_fns(avx2);
+decl_loopfilter_sb_fns(avx512icl);
+
+static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl);
+#endif
+}
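loop_filter_dsp_init_x86() above follows the cascading-override pattern used by these headers: assign the best function pointers for each ISA level in ascending order and return as soon as a required CPU flag is missing, so the last assignment reached wins. A self-contained sketch of that control flow (all names hypothetical, not dav1d API):

    #include <stdio.h>

    /* Hypothetical flags standing in for DAV1D_X86_CPU_FLAG_*. */
    enum { FLAG_SSSE3 = 1 << 0, FLAG_AVX2 = 1 << 1, FLAG_AVX512ICL = 1 << 2 };

    static const char *pick_lpf_impl(unsigned flags)
    {
        const char *impl = "c";               /* generic fallback */
        if (!(flags & FLAG_SSSE3)) return impl;
        impl = "ssse3";
        if (!(flags & FLAG_AVX2)) return impl;
        impl = "avx2";
        if (!(flags & FLAG_AVX512ICL)) return impl;
        return "avx512icl";
    }

    int main(void)
    {
        printf("%s\n", pick_lpf_impl(FLAG_SSSE3 | FLAG_AVX2)); /* prints "avx2" */
        return 0;
    }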
diff --git a/third_party/dav1d/src/x86/loopfilter16_avx2.asm b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
new file mode 100644
index 0000000000..ed83000ac2
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
@@ -0,0 +1,1161 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
+pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
+ times 4 db 8, 9
+ times 4 db 0, 1
+ times 4 db 8, 9
+
+pw_1: times 16 dw 1
+pw_2: times 16 dw 2
+pw_3: times 16 dw 3
+pw_4096: times 2 dw 4096
+
+; 10bpc/12bpc:
+pw_4: times 2 dw 4
+ times 2 dw 16
+clip_max: times 2 dw 511
+ times 2 dw 2047
+clip_min: times 2 dw -512
+ times 2 dw -2048
+
+SECTION .text
+
+; in: out:
+; mm%1 a b c d a e i m
+; mm%2 e f g h b f j n
+; mm%3 i j k l -> c g k o
+; mm%4 m n o p d h l p
+%macro TRANSPOSE4X4W 5
+ punpcklwd m%5, m%1, m%2
+ punpckhwd m%1, m%2
+ punpcklwd m%2, m%3, m%4
+ punpckhwd m%3, m%4
+ punpckldq m%4, m%5, m%2
+ punpckhdq m%5, m%2
+ punpckldq m%2, m%1, m%3
+ punpckhdq m%1, m%3
+
+ SWAP %1, %4
+ SWAP %2, %5, %3
+%endmacro
+
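The in/out diagram above describes a 4x4 transpose of 16-bit words, carried out per 128-bit lane with word/dword interleaves; TRANSPOSE8X8W below builds an 8x8 transpose from two such 4x4 transposes plus qword interleaves. As a plain scalar reference for what one lane ends up holding (function name illustrative only):

    #include <stdint.h>

    /* Scalar equivalent of TRANSPOSE4X4W within one lane: element (i, j)
     * of the 4x4 block of words moves to (j, i). */
    static void transpose_4x4_w(uint16_t m[4][4])
    {
        for (int i = 0; i < 4; i++)
            for (int j = i + 1; j < 4; j++) {
                const uint16_t t = m[i][j];
                m[i][j] = m[j][i];
                m[j][i] = t;
            }
    }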
+; in: out:
+; xmm%1 a b c d e f g h a i q y 6 E M U
+; xmm%2 i j k l m n o p b j r z 7 F N V
+; xmm%3 q r s t u v w x c k s 0 8 G O W
+; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X
+; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y
+; xmm%6 E F G H I J K L f n v 3 B J R Z
+; xmm%7 M N O P Q R S T g o w 4 C K S +
+; xmm%8 U V W X Y Z + = h p x 5 D L T =
+%macro TRANSPOSE8X8W 9
+ ; xmm%1 a b c d e f g h a i q y b j r z
+ ; xmm%2 i j k l m n o p c k s 0 d l t 1
+ ; xmm%3 q r s t u v w x -> e m u 2 f n v 3
+ ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5
+ TRANSPOSE4X4W %1, %2, %3, %4, %9
+
+ ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V
+ ; xmm%6 E F G H I J K L 8 G O W 9 H P X
+ ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z
+ ; xmm%8 U V W X Y Z + = C K S + D L T =
+ TRANSPOSE4X4W %5, %6, %7, %8, %9
+
+ ; xmm%1 a i q y b j r z a i q y 6 E M U
+ ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V
+ ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W
+ ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X
+ ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y
+ ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z
+ ; xmm%7 A I Q Y B J R Z g o w 4 C K S +
+ ; xmm%8 C K S + D L T = h p x 5 D L T =
+ punpckhqdq m%9, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ punpckhqdq m%7, m%4, m%8
+ punpcklqdq m%4, m%8
+
+ SWAP %8, %7, %4, %5, %3, %2, %9
+%endmacro
+
+; transpose and write m3-6, everything else is scratch
+%macro TRANSPOSE_8x4_AND_WRITE_4x16 0
+ ; transpose 8x4
+ punpcklwd m0, m3, m4
+ punpckhwd m3, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckldq m6, m0, m4
+ punpckhdq m0, m4
+ punpckldq m4, m3, m5
+ punpckhdq m3, m5
+
+ ; write out
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm0
+ movhps [dstq+stride3q -4], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm4
+ movhps [dstq+strideq*1-4], xm4
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm6, m6, 1
+ vextracti128 xm0, m0, 1
+ vextracti128 xm4, m4, 1
+ vextracti128 xm3, m3, 1
+
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm0
+ movhps [dstq+stride3q -4], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm4
+ movhps [dstq+strideq*1-4], xm4
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+ ; we load p3 later
+ mova m13, [tmpq+strideq*1]
+ mova m3, [tmpq+strideq*2]
+ mova m4, [tmpq+stride3q]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m15, [dstq+stride3q]
+%endif
+%endif
+%else
+ ; load lines
+%if %1 == 4
+ movq xm3, [dstq+strideq*0-4]
+ movq xm4, [dstq+strideq*1-4]
+ movq xm5, [dstq+strideq*2-4]
+ movq xm6, [dstq+stride3q -4]
+ lea tmpq, [dstq+strideq*4]
+ movq xm11, [tmpq+strideq*0-4]
+ movq xm13, [tmpq+strideq*1-4]
+ movq xm14, [tmpq+strideq*2-4]
+ movq xm15, [tmpq+stride3q -4]
+ lea tmpq, [tmpq+strideq*4]
+ ; this overreads by 8 bytes but the buffers are padded
+ ; so that should be ok
+ vinserti128 m3, [tmpq+strideq*0-4], 1
+ vinserti128 m4, [tmpq+strideq*1-4], 1
+ vinserti128 m5, [tmpq+strideq*2-4], 1
+ vinserti128 m6, [tmpq+stride3q -4], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m11, [tmpq+strideq*0-4], 1
+ vinserti128 m13, [tmpq+strideq*1-4], 1
+ vinserti128 m14, [tmpq+strideq*2-4], 1
+ vinserti128 m15, [tmpq+stride3q -4], 1
+
+ ; transpose 4x8
+ ; xm3: A-D0,A-D4
+ ; xm4: A-D1,A-D5
+ ; xm5: A-D2,A-D6
+ ; xm6: A-D3,A-D7
+ punpcklwd m7, m3, m4
+ punpcklwd m3, m11, m13
+ punpcklwd m4, m5, m6
+ punpcklwd m5, m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1
+ ; xm3: A4-5,B4-5,C4-5,D4-5
+ ; xm4: A2-3,B2-3,C2-3,D2-3
+ ; xm5: A6-7,B6-7,C6-7,D6-7
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m8, m3, m5
+ punpckhdq m5, m3, m5
+ ; xm6: A0-3,B0-3
+ ; xm7: C0-3,D0-3
+ ; xm8: A4-7,B4-7
+ ; xm5: C4-7,D4-7
+ punpcklqdq m3, m6, m8
+ punpckhqdq m4, m6, m8
+ punpckhqdq m6, m7, m5
+ punpcklqdq m5, m7, m5
+ ; xm3: A0-7
+ ; xm4: B0-7
+ ; xm5: C0-7
+ ; xm6: D0-7
+%elif %1 == 6 || %1 == 8
+ movu xm3, [dstq+strideq*0-8]
+ movu xm4, [dstq+strideq*1-8]
+ movu xm5, [dstq+strideq*2-8]
+ movu xm6, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu xm11, [tmpq+strideq*0-8]
+ movu xm13, [tmpq+strideq*1-8]
+ movu xm14, [tmpq+strideq*2-8]
+ movu xm15, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m3, [tmpq+strideq*0-8], 1
+ vinserti128 m4, [tmpq+strideq*1-8], 1
+ vinserti128 m5, [tmpq+strideq*2-8], 1
+ vinserti128 m6, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m11, [tmpq+strideq*0-8], 1
+ vinserti128 m13, [tmpq+strideq*1-8], 1
+ vinserti128 m14, [tmpq+strideq*2-8], 1
+ vinserti128 m15, [tmpq+stride3q -8], 1
+
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm11: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklwd m7, m3, m4
+ punpckhwd m3, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m11, m13
+ punpckhwd m11, m13
+ punpcklwd m13, m14, m15
+ punpckhwd m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1
+ ; xm3: E0-1,F0-1,G0-1,H0-1
+ ; xm4: A2-3,B2-3,C2-3,D2-3
+ ; xm5: E2-3,F2-3,G2-3,H2-3
+ ; xm6: A4-5,B4-5,C4-5,D4-5
+ ; xm11: E4-5,F4-5,G4-5,H4-5
+ ; xm13: A6-7,B6-7,C6-7,D6-7
+ ; xm14: E6-7,F6-7,G6-7,H6-7
+ punpckldq m15, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m3, m5
+ punpckhdq m8, m3, m5
+ punpckldq m3, m6, m13
+ punpckhdq m6, m13
+ punpckldq m10, m11, m14
+ punpckhdq m11, m14
+ ; xm15: A0-3,B0-3
+ ; xm7: C0-3,D0-3
+ ; xm9: E0-3,F0-3
+ ; xm8: G0-3,H0-3
+ ; xm3: A4-7,B4-7
+ ; xm6: C4-7,D4-7
+ ; xm10: E4-7,F4-7
+ ; xm11: G4-7,H4-7
+%if %1 != 6
+ punpcklqdq m0, m15, m3
+%endif
+ punpckhqdq m13, m15, m3
+ punpcklqdq m3, m7, m6
+ punpckhqdq m4, m7, m6
+ punpcklqdq m5, m9, m10
+ punpckhqdq m6, m9, m10
+ punpcklqdq m14, m8, m11
+%if %1 != 6
+ punpckhqdq m15, m8, m11
+ mova [rsp+5*32], m0
+%endif
+%else
+ ; We only use 14 pixels but we'll need the remainder at the end for
+ ; the second transpose
+ mova xm0, [dstq+strideq*0-16]
+ mova xm1, [dstq+strideq*1-16]
+ mova xm2, [dstq+strideq*2-16]
+ mova xm3, [dstq+stride3q -16]
+ lea tmpq, [dstq+strideq*4]
+ mova xm4, [tmpq+strideq*0-16]
+ mova xm5, [tmpq+strideq*1-16]
+ mova xm6, [tmpq+strideq*2-16]
+ mova xm7, [tmpq+stride3q -16]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, m0, [tmpq+strideq*0-16], 1
+ vinserti128 m1, m1, [tmpq+strideq*1-16], 1
+ vinserti128 m2, m2, [tmpq+strideq*2-16], 1
+ vinserti128 m3, m3, [tmpq+stride3q -16], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m4, m4, [tmpq+strideq*0-16], 1
+ vinserti128 m5, m5, [tmpq+strideq*1-16], 1
+ vinserti128 m6, m6, [tmpq+strideq*2-16], 1
+ vinserti128 m7, m7, [tmpq+stride3q -16], 1
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+
+ mova [rsp+6*32], m0
+ mova [rsp+7*32], m1
+ mova [rsp+8*32], m2
+ mova [rsp+9*32], m3
+ mova [rsp+5*32], m4
+
+ mova xm0, [dstq+strideq*0]
+ mova xm1, [dstq+strideq*1]
+ mova xm2, [dstq+strideq*2]
+ mova xm3, [dstq+stride3q ]
+ lea tmpq, [dstq+strideq*4]
+ mova xm8, [tmpq+strideq*0]
+ mova xm9, [tmpq+strideq*1]
+ mova xm10, [tmpq+strideq*2]
+ mova xm11, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, m0, [tmpq+strideq*0], 1
+ vinserti128 m1, m1, [tmpq+strideq*1], 1
+ vinserti128 m2, m2, [tmpq+strideq*2], 1
+ vinserti128 m3, m3, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m8, m8, [tmpq+strideq*0], 1
+ vinserti128 m9, m9, [tmpq+strideq*1], 1
+ vinserti128 m10, m10, [tmpq+strideq*2], 1
+ vinserti128 m11, m11, [tmpq+stride3q ], 1
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4
+
+ mova [rsp+10*32], m8
+ mova [rsp+11*32], m9
+ mova [rsp+12*32], m10
+ mova [rsp+13*32], m11
+
+ ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15
+ SWAP 13, 5, 0
+ SWAP 3, 6, 1, 15
+ SWAP 4, 7
+ SWAP 2, 14
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ pmovzxbw m1, [lq]
+ pmovzxbw m0, [lq+l_strideq]
+ pxor m2, m2
+%else
+ vpbroadcastq m0, [lq] ; l0, l1
+ vpbroadcastq m1, [lq+l_strideq] ; l2, l3
+ vpbroadcastq m2, [lq+l_strideq*2] ; l4, l5
+ vpbroadcastq m10, [lq+l_stride3q] ; l6, l7
+ punpckldq m0, m1 ; l0, l2, l1, l3 [2x]
+ punpckldq m2, m10 ; l4, l6, l5, l7 [2x]
+ vpblendd m0, m0, m2, 11110000b ; l0, l2, l1, l3, l4, l6, l5, l7
+ pxor m2, m2
+ punpcklbw m1, m0, m2 ; l0, l2, l4, l6
+ punpckhbw m0, m2 ; l1, l3, l5, l7
+%endif
+ pcmpeqw m10, m2, m0
+ pand m1, m10
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
+ pcmpeqw m10, m2, m0 ; !L
+ psrlw m10, 1
+ psrlw m2, m0, [lutq+128]
+ vpbroadcastw m1, [lutq+136]
+ pminuw m2, m1
+ pmaxuw m2, [pw_1] ; I
+ psrlw m1, m0, 4 ; H
+ paddw m0, [pw_2]
+ vpbroadcastd m8, [r11]
+ paddw m0, m0
+ paddw m0, m2 ; E
+ REPX {pmullw x, m8}, m0, m1, m2
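+    ; E/I/H come from the lut in the 8-bit domain; the pmullw by [r11]
+    ; (4 for 10 bpc, 16 for 12 bpc, selected at entry) rescales them to
+    ; the current bit depth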
+
+ psubw m8, m3, m4 ; p1-p0
+ psubw m9, m5, m6 ; q1-q0
+ REPX {pabsw x, x}, m8, m9
+ pmaxuw m8, m10
+ pmaxuw m8, m9
+ pcmpgtw m7, m8, m1 ; hev
+%if %1 != 4
+ psubw m9, m13, m4 ; p2-p0
+ pabsw m9, m9
+ pmaxuw m9, m8
+%if %1 != 6
+%ifidn %2, v
+ mova m11, [tmpq+strideq*0] ; p3
+%else
+ mova m11, [rsp+5*32] ; p3
+%endif
+ psubw m10, m11, m4 ; p3-p0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%endif
+ psubw m10, m5, m14 ; q2-q0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%if %1 != 6
+ psubw m10, m5, m15 ; q3-q0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%endif
+ vpbroadcastd m10, [r11]
+ pcmpgtw m9, m10 ; !flat8in
+
+ psubw m10, m13, m3 ; p2-p1
+ pabsw m10, m10
+%if %1 != 6
+ psubw m11, m13 ; p3-p2
+ pabsw m11, m11
+ pmaxuw m10, m11
+ psubw m11, m14, m15 ; q3-q2
+ pabsw m11, m11
+ pmaxuw m10, m11
+%endif
+ psubw m11, m14, m6 ; q2-q1
+ pabsw m11, m11
+ pmaxuw m10, m11
+
+%if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ vpbroadcastd m1, [maskq+4]
+ por m11, m1
+ pand m11, m12
+ pcmpeqd m11, m12
+ pand m10, m11
+%else
+ vpbroadcastd m11, [maskq+4]
+ pand m11, m12
+ pcmpeqd m11, m12
+ pand m10, m11 ; only apply fm-wide to wd>4 blocks
+%endif
+ pmaxuw m8, m10
+%endif
+ pcmpgtw m8, m2
+
+ psubw m10, m3, m6 ; p1-q1
+ psubw m11, m4, m5 ; p0-q0
+ REPX {pabsw x, x}, m10, m11
+ paddw m11, m11
+ psrlw m10, 1
+ paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m8, m10
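+    ; m8 is now !fm: set where the pixel differences exceed I, or where
+    ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E, i.e. edges that must not be filtered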
+
+%if %1 == 16
+
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+ mova m1, [tmpq+strideq*2]
+ mova m2, [tmpq+stride3q]
+%else
+ mova m0, [rsp+7*32]
+ mova m1, [rsp+8*32]
+ mova m2, [rsp+9*32]
+%endif
+ REPX {psubw x, m4}, m0, m1, m2
+ REPX {pabsw x, x}, m0, m1, m2
+ pmaxuw m1, m0
+ pmaxuw m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+ mova m2, [tmpq+strideq*1]
+ mova m10, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+10*32]
+ mova m2, [rsp+11*32]
+ mova m10, [rsp+12*32]
+%endif
+ REPX {psubw x, m5}, m0, m2, m10
+ REPX {pabsw x, x}, m0, m2, m10
+ pmaxuw m0, m2
+ pmaxuw m1, m10
+ pmaxuw m1, m0
+ vpbroadcastd m0, [r11]
+ pcmpgtw m1, m0 ; !flat8out
+ por m1, m9 ; !flat8in | !flat8out
+ vpbroadcastd m2, [maskq+8]
+ pand m10, m2, m12
+ pcmpeqd m10, m12
+ pandn m1, m10 ; flat16
+ pandn m1, m8, m1 ; flat16 & fm
+
+ vpbroadcastd m10, [maskq+4]
+ por m10, m2
+ pand m2, m10, m12
+ pcmpeqd m2, m12
+ pandn m9, m2 ; flat8in
+ pandn m9, m8, m9
+ vpbroadcastd m2, [maskq+0]
+ por m2, m10
+ pand m2, m12
+ pcmpeqd m2, m12
+ pandn m8, m2
+ pandn m8, m9, m8 ; fm & !flat8 & !flat16
+ pandn m9, m1, m9 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ pand m2, m0, m12
+ pcmpeqd m2, m12
+ pandn m9, m2
+ pandn m9, m8, m9 ; flat8 & fm
+ vpbroadcastd m2, [maskq+0]
+ por m0, m2
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m8, m0
+ pandn m8, m9, m8 ; fm & !flat8
+%else
+ vpbroadcastd m0, [maskq+0]
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m8, m0 ; fm
+%endif
+
+ ; short filter
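+    ; f   = iclip_diff(3*(q0-p0) + (iclip_diff(p1-q1) & hev)) & fm
+    ; p0 += (f+3)>>3, q0 -= (f+4)>>3
+    ; if !hev: p1 += ((f+4)>>3+1)>>1, q1 -= ((f+4)>>3+1)>>1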
+ vpbroadcastd m0, [r11+8*1] ; 511 or 2047
+ vpbroadcastd m2, [r11+8*2] ; -512 or -2048
+ psubw m10, m5, m4
+ paddw m11, m10, m10
+ paddw m11, m10
+ psubw m10, m3, m6 ; iclip_diff(p1-q1)
+ pminsw m10, m0
+ pmaxsw m10, m2
+ pand m10, m7 ; f=iclip_diff(p1-q1)&hev
+ paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f)
+ pminsw m10, m0
+ pmaxsw m10, m2
+ pand m8, m10 ; f&=fm
+ vpbroadcastd m10, [pw_4]
+ paddw m10, m8
+ paddw m8, [pw_3]
+ REPX {pminsw x, m0}, m10, m8
+    psraw           m10, 3 ; f1
+    psraw            m8, 3 ; f2
+ psubw m5, m10
+ paddw m4, m8
+
+ paddw m10, [pw_1]
+ psraw m10, 1 ; f=(f1+1)>>1
+ pandn m8, m7, m10 ; f&=!hev
+ paddw m3, m8
+ psubw m6, m8
+ pxor m8, m8
+ psubw m0, m2 ; 1023 or 4095
+ REPX {pminsw x, m0}, m3, m4, m5, m6
+ REPX {pmaxsw x, m8}, m3, m4, m5, m6
+
+%if %1 == 16
+
+; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16
+; m12=filter bits mask
+; m13-15=p2/q2/q3
+; m0,2,7-8,10-11 = free
+
+ ; flat16 filter
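+    ; 15-tap filter evaluated as a running sum: start from
+    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0, then for each subsequent output drop
+    ; the oldest taps, add the next q pixel and shift right by 4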
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+ mova m11, [tmpq+strideq*4] ; p3
+%else
+ mova m0, [rsp+7*32]
+ mova m2, [rsp+8*32]
+ mova m7, [rsp+9*32]
+ mova m11, [rsp+5*32]
+%endif
+
+ mova [rsp+ 0*32], m9
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ paddw m8, m0, [pw_1]
+ psllw m8, 3 ; p6*8+8
+ paddw m10, m2, m7 ; p5+p4
+ psubw m8, m0
+ paddw m10, m10 ; (p5+p4)*2
+ paddw m8, m11 ; p6*7+p3
+ paddw m10, m13 ; (p5+p4)*2+p2
+ paddw m8, m3 ; p6*7+p3+p1
+ paddw m10, m4 ; (p5+p4)*2+p2+p0
+ paddw m8, m5 ; p6*7+p3+p1+q0
+ paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psrlw m10, m8, 4
+ vpblendvb m10, m2, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m10 ; p5
+%else
+ mova [rsp+8*32], m10
+%endif
+
+ ; sub p6*2, add p3/q1
+ paddw m8, m11
+ paddw m10, m0, m0
+ paddw m8, m6
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m7, m10, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m10 ; p4
+%else
+ mova [rsp+9*32], m10
+%endif
+
+ ; sub p6/p5, add p2/q2
+ psubw m8, m0
+ paddw m10, m13, m14
+ psubw m8, m2
+ paddw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m11, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m10 ; p3
+ lea tmpq, [dstq+strideq*4]
+%else
+ mova [rsp+5*32], m10
+%endif
+
+ ; sub p6/p4, add p1/q3
+ paddw m8, m3
+ paddw m10, m0, m7
+ paddw m8, m15
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m13, m10, m1
+ mova [rsp+1*32], m10 ; don't clobber p2/m13
+
+ ; sub p6/p3, add p0/q4
+ paddw m8, m4
+ paddw m10, m0, m11
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*0]
+%else
+ paddw m8, [rsp+10*32]
+%endif
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m3, m10, m1
+ mova [rsp+2*32], m10 ; don't clobber p1/m3
+
+ ; sub p6/p2, add q0/q5
+ paddw m8, m5
+ paddw m10, m0, m13
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*1]
+%else
+ paddw m8, [rsp+11*32]
+%endif
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m4, m10, m1
+ mova [rsp+3*32], m10 ; don't clobber p0/m4
+
+ ; sub p6/p1, add q1/q6
+ paddw m8, m6
+ paddw m10, m0, m3
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else
+ mova m0, [rsp+12*32] ; q6
+%endif
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m5, m10, m1
+ mova [rsp+4*32], m10 ; don't clobber q0/m5
+
+ ; sub p5/p0, add q2/q6
+ paddw m8, m14
+ paddw m10, m2, m4
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6
+
+ ; sub p4/q0, add q3/q6
+ paddw m8, m15
+ paddw m10, m7, m5
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14
+
+ ; sub p3/q1, add q4/q6
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*0]
+%else
+ paddw m8, [rsp+10*32]
+%endif
+ paddw m10, m11, m6
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m15, m10, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m10 ; q3
+%else
+ mova [rsp+14*32], m10
+%endif
+
+ ; sub p2/q2, add q5/q6
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*1]
+%else
+ paddw m8, [rsp+11*32]
+%endif
+ paddw m10, m13, m14
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0]
+%else
+ mova m9, [rsp+10*32]
+%endif
+ vpblendvb m10, m9, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m10 ; q4
+%else
+ mova [rsp+10*32], m10
+%endif
+
+ ; sub p1/q3, add q6*2
+ psubw m8, m3
+ paddw m0, m0
+ psubw m8, m15
+ paddw m8, m0
+ psrlw m10, m8, 4
+%ifidn %2, v
+ mova m9, [tmpq+strideq*1]
+%else
+ mova m9, [rsp+11*32]
+%endif
+ vpblendvb m10, m9, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+11*32], m10
+%endif
+
+ mova m9, [rsp+0*32]
+ mova m13, [rsp+1*32]
+ mova m3, [rsp+2*32]
+ mova m4, [rsp+3*32]
+ mova m5, [rsp+4*32]
+ SWAP 2, 6
+ SWAP 7, 14
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%else
+ mova m15, [rsp+14*32]
+%endif
+%endif
+
+%if %1 >= 8
+ ; flat8 filter
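+    ; 7-tap filter; pmulhrsw with pw_4096 is a rounded shift:
+    ; (x*4096 + 0x4000) >> 15 == (x+4) >> 3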
+ vpbroadcastd m7, [pw_4096]
+%ifidn %2, v
+ mova m0, [tmpq+strideq*0] ; p3
+%else
+ mova m0, [rsp+5*32] ; p3
+%endif
+ paddw m1, m0, m13 ; p3+p2
+ paddw m2, m3, m4 ; p1+p0
+ paddw m8, m1, m1 ; 2*(p3+p2)
+ paddw m2, m0 ; p1+p0+p3
+ paddw m8, m5 ; 2*(p3+p2)+q0
+ paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0
+ pmulhrsw m10, m2, m7
+
+ paddw m8, m3, m6
+ psubw m2, m1
+ paddw m2, m8
+ pmulhrsw m8, m2, m7
+
+ paddw m11, m0, m3
+ paddw m1, m4, m14
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m1, m2, m7
+
+ paddw m11, m0, m4
+ pblendvb m4, m1, m9
+ paddw m1, m5, m15
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m11, m2, m7
+
+ paddw m2, m6
+ paddw m2, m15
+ paddw m1, m13, m5
+ pblendvb m5, m11, m9
+ pblendvb m13, m10, m9
+ psubw m2, m1
+ pmulhrsw m1, m2, m7
+
+ psubw m2, m3
+ pblendvb m3, m8, m9
+ psubw m2, m6
+ pblendvb m6, m1, m9
+ paddw m1, m15, m14
+ paddw m2, m1
+ pmulhrsw m2, m7
+
+ pblendvb m14, m2, m9
+
+%ifidn %2, v
+ mova [tmpq+strideq*1], m13 ; p2
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m4 ; p0
+ mova [dstq+strideq*0], m5 ; q0
+ mova [dstq+strideq*1], m6 ; q1
+ mova [dstq+strideq*2], m14 ; q2
+%elif %1 == 8
+ TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1
+
+ ; write 8x16
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm3
+ movu [dstq+stride3q -8], xm4
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm5
+ movu [dstq+strideq*1-8], xm6
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m0, 1
+ vextracti128 [dstq+strideq*1-8], m13, 1
+ vextracti128 [dstq+strideq*2-8], m3, 1
+ vextracti128 [dstq+stride3q -8], m4, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m5, 1
+ vextracti128 [dstq+strideq*1-8], m6, 1
+ vextracti128 [dstq+strideq*2-8], m14, 1
+ vextracti128 [dstq+stride3q -8], m15, 1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova m8, [rsp+6*32]
+ mova m1, [rsp+7*32]
+ mova m2, [rsp+8*32]
+ mova m7, [rsp+9*32]
+ TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9
+
+ mova [dstq+strideq*0-16], xm8
+ mova [dstq+strideq*1-16], xm1
+ mova [dstq+strideq*2-16], xm2
+ mova [dstq+stride3q -16], xm7
+ lea tmpq, [dstq+strideq*4]
+ mova [tmpq+strideq*0-16], xm0
+ mova [tmpq+strideq*1-16], xm13
+ mova [tmpq+strideq*2-16], xm3
+ mova [tmpq+stride3q -16], xm4
+ lea tmpq, [tmpq+strideq*4]
+ vextracti128 [tmpq+strideq*0-16], m8, 1
+ vextracti128 [tmpq+strideq*1-16], m1, 1
+ vextracti128 [tmpq+strideq*2-16], m2, 1
+ vextracti128 [tmpq+stride3q -16], m7, 1
+ lea tmpq, [tmpq+strideq*4]
+ vextracti128 [tmpq+strideq*0-16], m0, 1
+ vextracti128 [tmpq+strideq*1-16], m13, 1
+ vextracti128 [tmpq+strideq*2-16], m3, 1
+ vextracti128 [tmpq+stride3q -16], m4, 1
+
+ mova m0, [rsp+10*32]
+ mova m1, [rsp+11*32]
+ mova m2, [rsp+12*32]
+ mova m3, [rsp+13*32]
+ TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4
+ mova [dstq+strideq*0], xm5
+ mova [dstq+strideq*1], xm6
+ mova [dstq+strideq*2], xm14
+ mova [dstq+stride3q ], xm15
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ mova [dstq+strideq*2], xm2
+ mova [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0], m5, 1
+ vextracti128 [dstq+strideq*1], m6, 1
+ vextracti128 [dstq+strideq*2], m14, 1
+ vextracti128 [dstq+stride3q ], m15, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0], m0, 1
+ vextracti128 [dstq+strideq*1], m1, 1
+ vextracti128 [dstq+strideq*2], m2, 1
+ vextracti128 [dstq+stride3q ], m3, 1
+ lea dstq, [dstq+strideq*4]
+%endif
+%elif %1 == 6
+ ; flat6 filter
+ vpbroadcastd m7, [pw_4096]
+ paddw m8, m3, m4
+ paddw m8, m13 ; p2+p1+p0
+ paddw m11, m13, m5
+ paddw m8, m8
+ paddw m8, m11 ; p2+2*(p2+p1+p0)+q0
+ pmulhrsw m2, m8, m7
+
+ paddw m8, m5
+ paddw m11, m13, m13
+ paddw m8, m6
+ psubw m8, m11
+ pmulhrsw m10, m8, m7
+
+ paddw m8, m6
+ paddw m11, m13, m3
+ paddw m8, m14
+ psubw m8, m11
+ pmulhrsw m11, m8, m7
+
+ psubw m8, m3
+ paddw m14, m14
+ psubw m8, m4
+ paddw m8, m14
+ pmulhrsw m8, m7
+
+ pblendvb m3, m2, m9
+ pblendvb m4, m10, m9
+ pblendvb m5, m11, m9
+ pblendvb m6, m8, m9
+
+%ifidn %2, v
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m4 ; p0
+ mova [dstq+strideq*0], m5 ; q0
+ mova [dstq+strideq*1], m6 ; q1
+%else
+ TRANSPOSE_8x4_AND_WRITE_4x16
+%endif
+%else
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_8x4_AND_WRITE_4x16
+%endif
+%endif
+%endmacro
+
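+; sb entry points: each .loop iteration advances 16 pixels (4 vmask bits);
+; the vmask[2]/vmask[1]/vmask[0] tests pick the widest applicable filter
+; for the current strip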
+INIT_YMM avx2
+cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+
+ call .v4
+
+.end:
+ pslld m12, 4
+ add lq, 16
+ add dstq, 32
+ shl mask_bitsd, 4
+ sub wd, 4
+ jg .loop
+ RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
+
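+; the lpf_h_* functions filter vertical edges; FILTER loads the rows through
+; a transpose so the same per-edge arithmetic as the _v_ case applies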
+INIT_YMM avx2
+cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov hd, hm
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+
+ call .h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ pslld m12, 4
+ lea lq, [lq+l_strideq*4]
+ shl mask_bitsd, 4
+ sub hd, 4
+ jg .loop
+ RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
+
+INIT_YMM avx2
+cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4
+
+.end:
+ pslld m12, 4
+ add lq, 16
+ add dstq, 32
+ shl mask_bitsd, 4
+ sub wd, 4
+ jg .loop
+ RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov hd, hm
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ pslld m12, 4
+ lea lq, [lq+l_strideq*4]
+ shl mask_bitsd, 4
+ sub hd, 4
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter16_avx512.asm b/third_party/dav1d/src/x86/loopfilter16_avx512.asm
new file mode 100644
index 0000000000..b7bc3aa106
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_avx512.asm
@@ -0,0 +1,912 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
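+; l_shuf_v is a 64-byte vpermb pattern interleaved with the word constants
+; below; the movsldup load used for the vertical L fetch duplicates the even
+; dwords, so the pw_* values in the odd dwords never reach the shuffle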
+l_shuf_v: times 2 db 0, 32
+pw_1: times 2 dw 1
+ times 2 db 4, 36
+pw_3: times 2 dw 3
+ times 2 db 8, 40
+pw_4: times 2 dw 4
+ times 2 db 12, 44
+pw_16: times 2 dw 16
+ times 2 db 16, 48
+pw_4096: times 2 dw 4096
+ times 2 db 20, 52
+pw_16384: times 2 dw 16384
+ times 2 db 24, 56
+pw_32767: times 2 dw 32767
+ times 2 db 28, 60
+ times 2 dw 0
+filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128
+stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25
+l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1
+clip_max: dw 511, 511, 2047, 2047
+clip_min: dw -512, -512, -2048, -2048
+
+SECTION .text
+
+%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp
+ punpckhwd m%9, m%5, m%6
+ punpcklwd m%5, m%6
+ punpckhwd m%6, m%1, m%2
+ punpcklwd m%1, m%2
+ punpckhwd m%2, m%7, m%8
+ punpcklwd m%7, m%8
+ punpckhwd m%8, m%3, m%4
+ punpcklwd m%3, m%4
+ punpckhdq m%4, m%1, m%3
+ punpckldq m%1, m%3
+ punpckldq m%3, m%5, m%7
+ punpckhdq m%5, m%7
+ punpckhdq m%7, m%6, m%8
+ punpckldq m%6, m%8
+ punpckldq m%8, m%9, m%2
+ punpckhdq m%9, m%2
+ punpckhqdq m%2, m%1, m%3
+ punpcklqdq m%1, m%3
+ punpcklqdq m%3, m%4, m%5
+ punpckhqdq m%4, m%5
+ punpcklqdq m%5, m%6, m%8
+ punpckhqdq m%6, m%8
+ punpckhqdq m%8, m%7, m%9
+ punpcklqdq m%7, m%9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%ifidn %2, v
+%if %1 == 16
+ lea tmpq, [dstq+mstrideq*8]
+    mova             m0, [tmpq+strideq*1 ] ; p6
+ mova m1, [tmpq+strideq*2 ] ; p5
+ mova m2, [tmpq+stride3q ] ; p4
+ mova m3, [tmpq+strideq*4 ] ; p3
+ mova m4, [tmpq+stride5q ] ; p2
+%elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+%if %1 == 8
+    mova             m3, [tmpq+strideq*0 ] ; p3
+%endif
+    mova             m4, [tmpq+strideq*1 ] ; p2
+%endif
+ mova m5, [dstq+mstrideq*2] ; p1
+ mova m6, [dstq+mstrideq*1] ; p0
+ mova m7, [dstq+strideq*0 ] ; q0
+ mova m8, [dstq+strideq*1 ] ; q1
+%if %1 != 4
+ mova m9, [dstq+strideq*2 ] ; q2
+%endif
+%if %1 == 8 || %1 == 16
+ mova m10, [dstq+stride3q ] ; q3
+%endif
+%if %1 == 16
+ mova m11, [dstq+strideq*4 ] ; q4
+ mova m22, [dstq+stride5q ] ; q5
+    mova            m23, [dstq+stride3q*2] ; q6
+%endif
+%else ; h
+%if %1 == 16
+ movu ym16, [dstq+strideq*0 -16]
+ movu ym17, [dstq+strideq*1 -16]
+ movu ym18, [dstq+strideq*2 -16]
+ movu ym19, [dstq+stride3q -16]
+ movu ym20, [dstq+strideq*4 -16]
+ movu ym22, [dstq+stride5q -16]
+ movu ym23, [dstq+stride3q*2-16]
+ movu ym28, [dstq+stride7q -16]
+ lea tmpq, [dstq+strideq*8 -16]
+ vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m10, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1
+ vinserti32x8 m22, m22, [tmpq+stride5q ], 1
+ vinserti32x8 m23, m23, [tmpq+stride3q*2], 1
+ vinserti32x8 m28, m28, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8]
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27
+ movu ym16, [tmpq+strideq*0 ]
+ movu ym17, [tmpq+strideq*1 ]
+ movu ym18, [tmpq+strideq*2 ]
+ movu ym19, [tmpq+stride3q ]
+ movu ym24, [tmpq+strideq*4 ]
+ movu ym25, [tmpq+stride5q ]
+ movu ym26, [tmpq+stride3q*2]
+ movu ym20, [tmpq+stride7q ]
+ lea tmpq, [tmpq+strideq*8]
+ vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m3, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1
+ vinserti32x8 m5, m25, [tmpq+stride5q ], 1
+ vinserti32x8 m6, m26, [tmpq+stride3q*2], 1
+ vinserti32x8 m20, m20, [tmpq+stride7q ], 1
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 27
+ vshufi32x4 m27, m7, m0, q2020
+ vshufi32x4 m7, m0, q3131
+ vshufi32x4 m0, m8, m1, q2020
+ vshufi32x4 m8, m1, q3131
+ vshufi32x4 m1, m9, m2, q2020
+ vshufi32x4 m9, m2, q3131
+ vshufi32x4 m2, m10, m3, q2020
+ vshufi32x4 m10, m3, q3131
+ vshufi32x4 m3, m11, m4, q2020
+ vshufi32x4 m11, m4, q3131
+ vshufi32x4 m4, m22, m5, q2020
+ vshufi32x4 m22, m5, q3131
+ vshufi32x4 m5, m23, m6, q2020
+ vshufi32x4 m23, m6, q3131
+ vshufi32x4 m6, m28, m20, q2020
+ vshufi32x4 m28, m20, q3131
+%elif %1 == 6 || %1 == 8
+%if %1 == 8
+ sub dstq, 8
+ movu xm16, [dstq+strideq*0 ]
+ movu xm17, [dstq+strideq*1 ]
+ movu xm18, [dstq+strideq*2 ]
+ movu xm19, [dstq+stride3q ]
+ movu xm24, [dstq+strideq*4 ]
+ movu xm25, [dstq+stride5q ]
+ movu xm26, [dstq+stride3q*2]
+ movu xm27, [dstq+stride7q ]
+ lea tmpq, [dstq+strideq*8 ]
+ vinserti128 ym16, [tmpq+strideq*0 ], 1
+ vinserti128 ym17, [tmpq+strideq*1 ], 1
+ vinserti128 ym18, [tmpq+strideq*2 ], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ vinserti128 ym24, [tmpq+strideq*4 ], 1
+ vinserti128 ym25, [tmpq+stride5q ], 1
+ vinserti128 ym26, [tmpq+stride3q*2], 1
+ vinserti128 ym27, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2
+ vinserti32x4 m9, m25, [tmpq+stride5q ], 2
+ vinserti32x4 m3, m26, [tmpq+stride3q*2], 2
+ vinserti32x4 m4, m27, [tmpq+stride7q ], 2
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, [tmpq+strideq*0 ], 3
+ vinserti32x4 m8, [tmpq+strideq*1 ], 3
+ vinserti32x4 m5, [tmpq+strideq*2 ], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ vinserti32x4 m2, [tmpq+strideq*4 ], 3
+ vinserti32x4 m9, [tmpq+stride5q ], 3
+ vinserti32x4 m3, [tmpq+stride3q*2], 3
+ vinserti32x4 m4, [tmpq+stride7q ], 3
+%else ; %1 == 6
+ movu xm16, [dstq+strideq*0-8]
+ movu xm17, [dstq+strideq*1-8]
+ movu xm18, [dstq+strideq*2-8]
+ movu xm19, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4-8]
+ movu xm2, [tmpq+strideq*0]
+ movu xm9, [tmpq+strideq*1]
+ movu xm3, [tmpq+strideq*2]
+ movu xm4, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym16, [tmpq+strideq*0], 1
+ vinserti128 ym17, [tmpq+strideq*1], 1
+ vinserti128 ym18, [tmpq+strideq*2], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym2, [tmpq+strideq*0], 1
+ vinserti128 ym9, [tmpq+strideq*1], 1
+ vinserti128 ym3, [tmpq+strideq*2], 1
+ vinserti128 ym4, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, m16, [tmpq+strideq*0], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 2
+ vinserti32x4 m9, [tmpq+strideq*1], 2
+ vinserti32x4 m3, [tmpq+strideq*2], 2
+ vinserti32x4 m4, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, [tmpq+strideq*0], 3
+ vinserti32x4 m8, [tmpq+strideq*1], 3
+ vinserti32x4 m5, [tmpq+strideq*2], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 3
+ vinserti32x4 m9, [tmpq+strideq*1], 3
+ vinserti32x4 m3, [tmpq+strideq*2], 3
+ vinserti32x4 m4, [tmpq+stride3q ], 3
+%endif
+ punpcklwd m6, m10, m8
+ punpckhwd m10, m8
+ punpcklwd m8, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m2, m9
+ punpckhwd m2, m9
+ punpcklwd m9, m3, m4
+ punpckhwd m3, m4
+ punpckldq m4, m6, m8
+ punpckhdq m6, m8
+ punpckldq m8, m10, m5
+ punpckhdq m10, m5
+ punpckldq m5, m7, m9
+ punpckhdq m7, m9
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+%if %1 == 8
+ punpcklqdq m3, m4, m5
+%endif
+ punpckhqdq m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m9
+ punpckhqdq m8, m9
+ punpcklqdq m9, m10, m2
+%if %1 == 8
+ punpckhqdq m10, m2
+%endif
+%else ; %1 == 4
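+    ; 4-wide horizontal edges: each vpgatherdq fetches one 4-pixel qword from
+    ; eight rows, using the byte offsets precomputed in ym12 (strideq scaled
+    ; by stride_mul)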
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdq m7{k1}, [dstq+ym12-4]
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpgatherdq m4{k2}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpgatherdq m5{k1}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ vpgatherdq m6{k2}, [tmpq+ym12]
+ punpcklwd m8, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m8, m7
+ punpckhwd m8, m7
+ punpcklwd m7, m4, m5
+ punpckhwd m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m4
+ punpckhqdq m8, m4
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ movu ym16, [lq+l_strideq*1]
+ movsldup m17, [l_shuf_v]
+ vptestnmb k1, ym16, ym16
+ vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][]
+ vpermb m16, m17, m16 ; l[x][1]
+%else
+ movq xm16, [lq+l_strideq*0]
+ movq xm17, [lq+l_strideq*1]
+ vinserti128 ym16, [lq+l_strideq*2], 1
+ vinserti128 ym17, [lq+l_stride3q ], 1
+ lea tmpq, [lq+l_strideq*4]
+ vinserti32x4 m16, [tmpq+l_strideq*0], 2
+ vinserti32x4 m17, [tmpq+l_strideq*1], 2
+ vinserti32x4 m16, [tmpq+l_strideq*2], 3
+ vinserti32x4 m17, [tmpq+l_stride3q ], 3
+ punpcklqdq m16, m17
+ vbroadcasti32x4 m17, [l_shuf_h]
+ vptestnmb k1, m16, m16
+ vpalignr m16{k1}, m16, 12
+ pshufb m16, m17 ; l[x][1]
+%endif
+ vpbroadcastd m20, [pw_32767]
+ psubw m17, m5, m6 ; p1-p0
+ psubw m18, m7, m8 ; q1-q0
+ vptestmw k1, m16, m16 ; L
+ pabsw m17, m17
+ pabsw m18, m18
+ vpmaxuw m20{k1}, m17, m18
+ vpbroadcastw m17, [lutq+136]
+ psrlw m18, m16, [lutq+128]
+ vpbroadcastd m19, [pw_1]
+ pminuw m18, m17
+ psrlw m17, m16, 4 ; H
+ paddw m16, m16
+ pmaxuw m18, m19 ; I
+ vpaddd m16, [pw_4] {1to16}
+ paddw m16, m18 ; E
+ REPX {pmullw x, m13}, m17, m18, m16
+ vpcmpw k4, m20, m17, 6 ; hev
+%if %1 != 4
+ psubw m19, m4, m5 ; p2-p1
+ pabsw m19, m19
+%if %1 == 8 || %1 == 16
+ psubw m17, m3, m4 ; p3-p2
+ pabsw m17, m17
+ pmaxuw m19, m17
+ psubw m17, m9, m10 ; q3-q2
+ pabsw m17, m17
+ pmaxuw m19, m17
+%endif
+ psubw m17, m9, m8 ; q2-q1
+ pabsw m17, m17
+ pmaxuw m19, m17
+%if %1 == 16
+ vpbroadcastd ym17, [maskq+4]
+ vpord ym17, [maskq+8] {1to8}
+ vptestmd k1, ym17, ym21
+%else
+ vptestmd k1, ym21, [maskq+4] {1to8}
+%endif
+ pmaxuw m19, m20
+ psubw m17, m4, m6 ; p2-p0
+ pabsw m17, m17
+ pmaxuw m17, m20
+ vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks
+%if %1 == 8 || %1 == 16
+ psubw m19, m3, m6 ; p3-p0
+ pabsw m19, m19
+ pmaxuw m17, m19
+ psubw m19, m7, m10 ; q3-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ psubw m19, m7, m9 ; q2-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ vpcmpw k1, m20, m18, 2
+ psubw m18, m5, m8 ; p1-q1
+ psubw m19, m6, m7 ; p0-q0
+ pabsw m18, m18
+ pabsw m19, m19
+ psrlw m18, 1
+ paddw m19, m19
+ paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+%if %1 != 4
+ vpcmpw k2{k1}, m17, m13, 2 ; flat8in
+%endif
+%if %1 == 16
+ psubw m20, m0, m6
+ psubw m16, m1, m6
+ pabsw m20, m20
+ psubw m17, m2, m6
+ pabsw m16, m16
+ psubw m18, m11, m7
+ pabsw m17, m17
+ psubw m19, m22, m7
+ pabsw m18, m18
+ pmaxuw m20, m16
+ psubw m16, m23, m7
+ pabsw m19, m19
+ pmaxuw m17, m18
+ pabsw m16, m16
+ vpandd ym18, ym21, [maskq+8] {1to8}
+ pmaxuw m20, m17
+ pmaxuw m19, m16
+ pcmpeqd ym16, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8
+ pmaxuw m20, m19
+ pcmpeqd ym17, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8
+ vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out
+ pcmpeqd ym18, ym21
+ vptestmb k3{k3}, ym16, ym16 ; flat8 & fm
+ vptestmb k2{k2}, ym17, ym17 ; flat8in
+ vptestmb k1{k1}, ym18, ym18
+ kandnd k1, k2, k1 ; fm & !flat8 & !flat16
+ kandnd k2, k3, k2 ; flat8 & !flat16
+%elif %1 == 6 || %1 == 8
+ vpandd ym17, ym21, [maskq+4] {1to8}
+ pcmpeqd ym16, ym21, ym17
+ vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8
+ pcmpeqd ym17, ym21
+ vptestmb k2{k2}, ym16, ym16 ; flat8 & fm
+ vptestmb k1{k1}, ym17, ym17
+ kandnd k1, k2, k1 ; fm & !flat8
+%else ; %1 == 4
+ vpandd ym16, ym21, [maskq+0] {1to8}
+ pcmpeqd ym16, ym21
+ vptestmb k1{k1}, ym16, ym16
+%endif
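+    ; unlike the AVX2 version, the filter masks live in opmask registers
+    ; (k1/k2/k3 gate the short/flat8/flat16 paths, k4 is hev) and results
+    ; are merge-written under them instead of blended with vpblendvb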
+
+ ; short filter
+ psubw m16, m7, m6
+ vpbroadcastd m17, [pw_3]
+ paddw m18, m16, m16
+ paddw m18, m16
+ psubw m16, m5, m8 ; iclip_diff(p1-q1)
+ pminsw m16, m14
+ vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev
+ knotd k4, k4 ; !hev
+ paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f)
+ vpbroadcastd m18, [pw_4]
+ pminsw m16, m14
+ vpmaxsw m16{k1}{z}, m15 ; f&=fm
+ paddw m17, m16
+ paddw m16, m18
+ vpbroadcastd m18, [pw_16384]
+ pminsw m17, m14
+ pminsw m16, m14
+ psraw m17, 3 ; f2
+ psraw m16, 3 ; f1
+ paddw m6, m17
+ psubw m7, m16
+ vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev
+ psubw m17, m14, m15 ; 1023 or 4095
+ pxor m18, m18
+ paddw m5, m16
+ psubw m8, m16
+ REPX {pminsw x, m17}, m6, m7, m5, m8
+ REPX {pmaxsw x, m18}, m6, m7, m5, m8
+
+%if %1 == 16 ; flat16 filter
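+    ; same running-sum scheme as the AVX2 flat16 filter above; each output
+    ; row is merge-written under k3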
+ vpaddd m19, m0, [pw_1] {1to16}
+ paddw m16, m1, m2 ; p5+p4
+ paddw m26, m1, m6 ; p5+p0
+ paddw m24, m2, m7 ; p4+q0
+ paddw m16, m4 ; p5+p4+p3
+ paddw m17, m3, m5 ; p2+p1
+ psllw m19, 3
+ paddw m16, m26 ; p5*2+p4+p3+p0
+ paddw m17, m24 ; p4+p2+p1+q0
+ psubw m19, m0 ; p6*7+8
+ paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0
+ paddw m18, m3, m8
+ paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0
+ paddw m25, m1, m0
+ paddw m16, m0, m0
+ psrlw m1{k3}, m19, 4
+ paddw m19, m18
+ psubw m19, m16 ; +p3+q1-p6*2
+ paddw m16, m2, m0
+ psrlw m2{k3}, m19, 4
+ psubw m19, m25
+ paddw m25, m4, m9
+ paddw m20, m10, m5
+ paddw m19, m25 ; +p2+q2-p6-p5
+ paddw m17, m0, m3
+ psubw m16, m20, m16
+ psrlw m3{k3}, m19, 4
+ paddw m19, m16 ; +p1+q3-p6-p4
+ paddw m16, m11, m6
+ psubw m16, m17
+ paddw m17, m0, m4
+ psrlw m4{k3}, m19, 4
+ paddw m19, m16 ; +p0+q4-p6-p3
+ paddw m16, m22, m7
+ psubw m16, m17
+ paddw m17, m0, m5
+ psrlw m5{k3}, m19, 4
+ paddw m19, m16 ; +q0+q5-p6-p2
+ paddw m16, m23, m8
+ psrlw m6{k3}, m19, 4
+ psubw m16, m17
+ paddw m19, m16 ; +q1+q6-p6-p1
+ paddw m16, m23, m9
+ psrlw m7{k3}, m19, 4
+ psubw m16, m26
+ paddw m19, m16 ; +q2+q6-p5-p0
+ paddw m16, m23, m10
+ psrlw m8{k3}, m19, 4
+ psubw m16, m24
+ paddw m19, m16 ; +q3+q6-p4-p0
+ paddw m16, m23, m11
+ psrlw m9{k3}, m19, 4
+ psubw m16, m18
+ paddw m19, m16 ; +q4+q6-p3-q1
+ paddw m16, m23, m22
+ psrlw m10{k3}, m19, 4
+ psubw m16, m25
+ paddw m19, m16 ; +q5+q6-p2-q2
+ paddw m16, m23, m23
+ psrlw m11{k3}, m19, 4
+ psubw m16, m20
+ paddw m19, m16 ; +q6*2-p1-q3
+ psrlw m22{k3}, m19, 4
+%endif
+%if %1 == 8 || %1 == 16 ; flat8 filter
+ vpbroadcastd m20, [pw_4096]
+ paddw m16, m3, m4 ; p3+p2
+ paddw m19, m5, m6 ; p1+p0
+ paddw m17, m16, m16 ; 2*(p3+p2)
+ paddw m19, m3 ; p1+p0+p3
+ paddw m17, m7 ; 2*(p3+p2)+q0
+ paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0
+ paddw m18, m4, m7
+ pmulhrsw m4{k2}, m19, m20
+ psubw m19, m16
+ paddw m17, m5, m8
+ paddw m16, m3, m5
+ paddw m19, m17
+ pmulhrsw m5{k2}, m19, m20
+ psubw m19, m16
+ paddw m16, m6, m9
+ paddw m19, m16
+ paddw m16, m3, m6
+ pmulhrsw m6{k2}, m19, m20
+ paddw m19, m10
+ psubw m16, m7, m16
+ paddw m19, m16
+ psubw m16, m10, m18
+ pmulhrsw m7{k2}, m19, m20
+ paddw m16, m8
+ paddw m19, m16
+ psubw m16, m10, m17
+ pmulhrsw m8{k2}, m19, m20
+ paddw m16, m9
+ paddw m19, m16
+ pmulhrsw m9{k2}, m19, m20
+%elif %1 == 6 ; flat6 filter
+ vpbroadcastd m10, [pw_4096]
+ paddw m2, m5, m6
+ paddw m0, m4, m7
+ paddw m1, m2, m4 ; p2+p1+p0
+ paddw m3, m4, m4
+ paddw m1, m1
+ paddw m4, m5
+ paddw m1, m0 ; p2+2*(p2+p1+p0)+q0
+ psubw m3, m7, m3
+ pmulhrsw m5{k2}, m1, m10
+ paddw m3, m8
+ psubw m4, m8, m4
+ paddw m1, m3
+ pmulhrsw m6{k2}, m1, m10
+ paddw m4, m9
+ paddw m9, m9
+ paddw m1, m4
+ pmulhrsw m7{k2}, m1, m10
+ psubw m9, m2
+ paddw m1, m9
+ pmulhrsw m8{k2}, m1, m10
+%endif
+
+%ifidn %2, v
+%if %1 == 16
+ mova [tmpq+strideq*2 ], m1 ; p5
+ mova [tmpq+stride3q ], m2 ; p4
+ mova [tmpq+strideq*4 ], m3 ; p3
+ mova [tmpq+stride5q ], m4 ; p2
+%elif %1 == 8
+ mova [tmpq+strideq*1 ], m4 ; p2
+%endif
+ mova [dstq+mstrideq*2], m5 ; p1
+ mova [dstq+mstrideq ], m6 ; p0
+ mova [dstq+strideq*0 ], m7 ; q0
+ mova [dstq+strideq*1 ], m8 ; q1
+%if %1 == 8 || %1 == 16
+ mova [dstq+strideq*2 ], m9 ; q2
+%endif
+%if %1 == 16
+ mova [dstq+stride3q ], m10 ; q3
+ mova [dstq+strideq*4 ], m11 ; q4
+ mova [dstq+stride5q ], m22 ; q5
+%endif
+%else
+%if %1 == 16
+ TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20
+ mova [dstq+strideq*0 -16], xm27
+ mova [dstq+strideq*0 ], xm7
+ mova [dstq+strideq*1 -16], xm0
+ mova [dstq+strideq*1 ], xm8
+ mova [dstq+strideq*2 -16], xm1
+ mova [dstq+strideq*2 ], xm9
+ mova [dstq+stride3q -16], xm2
+ mova [dstq+stride3q ], xm10
+ mova [dstq+strideq*4 -16], xm3
+ mova [dstq+strideq*4 ], xm11
+ mova [dstq+stride5q -16], xm4
+ mova [dstq+stride5q ], xm22
+ mova [dstq+stride3q*2-16], xm5
+ mova [dstq+stride3q*2 ], xm23
+ mova [dstq+stride7q -16], xm6
+ mova [dstq+stride7q ], xm28
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 -16], ym27, 1
+ vextracti128 [dstq+strideq*0 ], ym7, 1
+ vextracti128 [dstq+strideq*1 -16], ym0, 1
+ vextracti128 [dstq+strideq*1 ], ym8, 1
+ vextracti128 [dstq+strideq*2 -16], ym1, 1
+ vextracti128 [dstq+strideq*2 ], ym9, 1
+ vextracti128 [dstq+stride3q -16], ym2, 1
+ vextracti128 [dstq+stride3q ], ym10, 1
+ vextracti128 [dstq+strideq*4 -16], ym3, 1
+ vextracti128 [dstq+strideq*4 ], ym11, 1
+ vextracti128 [dstq+stride5q -16], ym4, 1
+ vextracti128 [dstq+stride5q ], ym22, 1
+ vextracti128 [dstq+stride3q*2-16], ym5, 1
+ vextracti128 [dstq+stride3q*2 ], ym23, 1
+ vextracti128 [dstq+stride7q -16], ym6, 1
+ vextracti128 [dstq+stride7q ], ym28, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 2
+ vextracti32x4 [dstq+strideq*0 ], m7, 2
+ vextracti32x4 [dstq+strideq*1 -16], m0, 2
+ vextracti32x4 [dstq+strideq*1 ], m8, 2
+ vextracti32x4 [dstq+strideq*2 -16], m1, 2
+ vextracti32x4 [dstq+strideq*2 ], m9, 2
+ vextracti32x4 [dstq+stride3q -16], m2, 2
+ vextracti32x4 [dstq+stride3q ], m10, 2
+ vextracti32x4 [dstq+strideq*4 -16], m3, 2
+ vextracti32x4 [dstq+strideq*4 ], m11, 2
+ vextracti32x4 [dstq+stride5q -16], m4, 2
+ vextracti32x4 [dstq+stride5q ], m22, 2
+ vextracti32x4 [dstq+stride3q*2-16], m5, 2
+ vextracti32x4 [dstq+stride3q*2 ], m23, 2
+ vextracti32x4 [dstq+stride7q -16], m6, 2
+ vextracti32x4 [dstq+stride7q ], m28, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 3
+ vextracti32x4 [dstq+strideq*0 ], m7, 3
+ vextracti32x4 [dstq+strideq*1 -16], m0, 3
+ vextracti32x4 [dstq+strideq*1 ], m8, 3
+ vextracti32x4 [dstq+strideq*2 -16], m1, 3
+ vextracti32x4 [dstq+strideq*2 ], m9, 3
+ vextracti32x4 [dstq+stride3q -16], m2, 3
+ vextracti32x4 [dstq+stride3q ], m10, 3
+ vextracti32x4 [dstq+strideq*4 -16], m3, 3
+ vextracti32x4 [dstq+strideq*4 ], m11, 3
+ vextracti32x4 [dstq+stride5q -16], m4, 3
+ vextracti32x4 [dstq+stride5q ], m22, 3
+ vextracti32x4 [dstq+stride3q*2-16], m5, 3
+ vextracti32x4 [dstq+stride3q*2 ], m23, 3
+ vextracti32x4 [dstq+stride7q -16], m6, 3
+ vextracti32x4 [dstq+stride7q ], m28, 3
+%elif %1 == 8
+ TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2
+ movu [dstq+strideq*0 ], xm3
+ movu [dstq+strideq*1 ], xm4
+ movu [dstq+strideq*2 ], xm5
+ movu [dstq+stride3q ], xm6
+ movu [dstq+strideq*4 ], xm7
+ movu [dstq+stride5q ], xm8
+ movu [dstq+stride3q*2], xm9
+ movu [dstq+stride7q ], xm10
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 ], ym3, 1
+ vextracti128 [dstq+strideq*1 ], ym4, 1
+ vextracti128 [dstq+strideq*2 ], ym5, 1
+ vextracti128 [dstq+stride3q ], ym6, 1
+ vextracti128 [dstq+strideq*4 ], ym7, 1
+ vextracti128 [dstq+stride5q ], ym8, 1
+ vextracti128 [dstq+stride3q*2], ym9, 1
+ vextracti128 [dstq+stride7q ], ym10, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 2
+ vextracti32x4 [dstq+strideq*1 ], m4, 2
+ vextracti32x4 [dstq+strideq*2 ], m5, 2
+ vextracti32x4 [dstq+stride3q ], m6, 2
+ vextracti32x4 [dstq+strideq*4 ], m7, 2
+ vextracti32x4 [dstq+stride5q ], m8, 2
+ vextracti32x4 [dstq+stride3q*2], m9, 2
+ vextracti32x4 [dstq+stride7q ], m10, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 3
+ vextracti32x4 [dstq+strideq*1 ], m4, 3
+ vextracti32x4 [dstq+strideq*2 ], m5, 3
+ vextracti32x4 [dstq+stride3q ], m6, 3
+ vextracti32x4 [dstq+strideq*4 ], m7, 3
+ vextracti32x4 [dstq+stride5q ], m8, 3
+ vextracti32x4 [dstq+stride3q*2], m9, 3
+ vextracti32x4 [dstq+stride7q ], m10, 3
+ lea dstq, [dstq+strideq*8+8]
+%else ; %1 == 4 || %1 == 6
+ punpcklwd m9, m5, m6
+ punpckhwd m5, m6
+ kxnorb k1, k1, k1
+ punpcklwd m6, m7, m8
+ punpckhwd m7, m8
+ kmovb k2, k1
+ punpckldq m8, m9, m6
+ vpscatterdq [dstq+ym12-4]{k1}, m8
+ punpckhdq m9, m6
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpscatterdq [tmpq+ym12]{k2}, m9
+ punpckldq m6, m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpscatterdq [tmpq+ym12]{k1}, m6
+ punpckhdq m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ vpscatterdq [tmpq+ym12]{k2}, m5
+%endif
+%endif
+%endmacro
+
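+; the AVX512 entry points cover 32 pixels (8 vmask bits) per iteration,
+; twice the AVX2 strip width, with the same vmask-based dispatch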
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride, tmp, \
+ mask_bits, stride5
+%define base tmpq-filter_mask
+ SWAP 12, 26 ; avoids clobbering xmm10 on WIN64
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, v
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call .v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.v4: ; called by both luma and chroma
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_16bpc, 6, 13, 29, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, l_stride3, tmp, \
+ mask_bits, stride5, stride7
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride7q, [strideq+stride3q*2]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, h
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, h
+ jmp .end2
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+ call .h4
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+.end:
+ lea dstq, [dstq+strideq*8]
+.end2:
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.h4: ; called by both luma and chroma
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ shl l_strideq, 2
+ lea stride3q, [strideq*3]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride3q, [strideq*3]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, h
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4
+.end:
+ lea tmpq, [strideq+stride3q]
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea dstq, [dstq+tmpq*8]
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter16_sse.asm b/third_party/dav1d/src/x86/loopfilter16_sse.asm
new file mode 100644
index 0000000000..c486b57a21
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_sse.asm
@@ -0,0 +1,1793 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%if ARCH_X86_64
+%define PIC_sym(a) a
+%else
+%define PIC_base $$
+%define PIC_sym(a) pic_regq+a-PIC_base
+%endif
+
+pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
+ times 4 db 8, 9
+
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_3: times 8 dw 3
+; 4 and 16 need to be next to each other since they are used as alternates
+; depending on whether bitdepth is 10 or 12
+pw_4: times 8 dw 4
+pw_16: times 8 dw 16
+pw_8: times 8 dw 8
+pw_4096: times 8 dw 4096
+
+pb_mask: dd 1, 1, 2, 2
+
+SECTION .text
+
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+%define extra_stack 2
+%else
+%define extra_stack 0
+%endif
+%endif
+
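+; on x86-32 the stack-passed arguments that are needed again later are copied
+; into local slots (or simply aliased to their r*m forms when the incoming
+; stack is already 16-byte aligned)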
+%macro RELOC_ARGS 2 ; h/v, off
+ASSERT ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+ mov r5d, [rstk + stack_offset + 4*4 + 4]
+%define lstridem [esp+%2+0*gprsize]
+ mov lstridem, r5d
+ mov r5d, [rstk + stack_offset + 4*5 + 4]
+%define lutm [esp+%2+1*gprsize]
+ mov lutm, r5d
+ mov r5d, [rstk + stack_offset + 4*6 + 4]
+%ifidn %1, v
+%define wm [esp+%2+2*gprsize]
+ mov wm, r5d
+ mov r5d, [rstk + stack_offset + 4*3 + 4]
+%define lm [esp+%2+3*gprsize]
+ mov lm, r5d
+%else ; %1 == h
+%define hm [esp+%2+2*gprsize]
+ mov hm, r5d
+%endif ; %1==v
+ mov r5d, r7m
+%define bdmulm [esp+%2+4*gprsize]
+ mov bdmulm, r5d
+%else
+%define lstridem r4m
+%define lutm r5m
+%ifidn %1, v
+%define wm r6m
+%define lm r3m
+%else
+%define hm r6m
+%endif
+%define bdmulm r7m
+%endif ; STACK_ALIGNMENT
+%endmacro
+
+%macro UNRELOC_ARGS 0
+%if ARCH_X86_32
+%undef lm
+%undef lstridem
+%undef wm
+%undef hm
+%undef lutm
+%endif
+%endmacro
+
+%macro SPLATD 2
+ movd %1, %2
+ pshufd %1, %1, q0000
+%endmacro
+
+%macro SPLATW 2
+ movd %1, %2
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+%endmacro
+
+; in: out:
+; mm%1 a b c d a e i m
+; mm%2 e f g h b f j n
+; mm%3 i j k l -> c g k o
+; mm%4 m n o p d h l p
+%macro TRANSPOSE4X4W 5
+ punpcklwd m%5, m%1, m%2
+ punpckhwd m%1, m%2
+ punpcklwd m%2, m%3, m%4
+ punpckhwd m%3, m%4
+ punpckldq m%4, m%5, m%2
+ punpckhdq m%5, m%2
+ punpckldq m%2, m%1, m%3
+ punpckhdq m%1, m%3
+
+ SWAP %1, %4
+ SWAP %2, %5, %3
+%endmacro
+
+; in: out:
+; m%1 a b c d e f g h a i q y 6 E M U
+; m%2 i j k l m n o p b j r z 7 F N V
+; m%3 q r s t u v w x c k s 0 8 G O W
+; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X
+; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y
+; m%6 E F G H I J K L f n v 3 B J R Z
+; m%7 M N O P Q R S T g o w 4 C K S +
+; m%8 U V W X Y Z + = h p x 5 D L T =
+%if ARCH_X86_64
+%macro TRANSPOSE8X8W 9
+ ; m%1 a b c d e f g h a i q y b j r z
+ ; m%2 i j k l m n o p c k s 0 d l t 1
+ ; m%3 q r s t u v w x -> e m u 2 f n v 3
+ ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5
+ TRANSPOSE4X4W %1, %2, %3, %4, %9
+
+ ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V
+ ; m%6 E F G H I J K L 8 G O W 9 H P X
+ ; m%7 M N O P Q R S T -> A I Q Y B J R Z
+ ; m%8 U V W X Y Z + = C K S + D L T =
+ TRANSPOSE4X4W %5, %6, %7, %8, %9
+
+ ; m%1 a i q y b j r z a i q y 6 E M U
+ ; m%2 c k s 0 d l t 1 b j r z 7 F N V
+ ; m%3 e m u 2 f n v 3 c k s 0 8 G O W
+ ; m%4 g o w 4 h p x 5 d l t 1 9 H P X
+ ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y
+ ; m%6 8 G O W 9 H P X f n v 3 B J R Z
+ ; m%7 A I Q Y B J R Z g o w 4 C K S +
+ ; m%8 C K S + D L T = h p x 5 D L T =
+ punpckhqdq m%9, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ punpckhqdq m%7, m%4, m%8
+ punpcklqdq m%4, m%8
+
+ SWAP %8, %7, %4, %5, %3, %2, %9
+%endmacro
+%else ; x86-32
+; input: 1-7 in registers, 8 in first memory [read-only]
+; second memory is scratch, and may overlap with first or third memory
+; output: 1-5,7-8 in registers, 6 in third memory [write-only]
+%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u in/out alignment [2x]
+ TRANSPOSE4X4W %1, %2, %3, %4, %8
+%ifnidn %9, ""
+ mov%12 m%8, %9
+%else
+ mova m%8, %10
+%endif
+ mova %10, m%4
+ TRANSPOSE4X4W %5, %6, %7, %8, %4
+ punpckhqdq m%4, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ mova m%7, %10
+%ifnidn %11, ""
+ mov%13 %11, m%6
+%else
+ mova %10, m%6
+%endif
+ punpckhqdq m%6, m%7, m%8
+ punpcklqdq m%7, m%8
+
+ ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8
+ SWAP %2, %4, %5, %3
+ SWAP %6, %8
+%endmacro
+%endif ; x86-32/64
+
+; transpose and write m8-11, everything else is scratch
+%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp
+ ; transpose 8x4
+ punpcklwd %5, %1, %2
+ punpckhwd %1, %2
+ punpcklwd %2, %3, %4
+ punpckhwd %3, %4
+ punpckldq %4, %5, %2
+ punpckhdq %5, %2
+ punpckldq %2, %1, %3
+ punpckhdq %1, %3
+
+ ; write out
+ movq [dstq+strideq*0-4], %4
+ movhps [dstq+strideq*1-4], %4
+ movq [dstq+strideq*2-4], %5
+ movhps [dstq+stride3q -4], %5
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], %2
+ movhps [dstq+strideq*1-4], %2
+ movq [dstq+strideq*2-4], %1
+ movhps [dstq+stride3q -4], %1
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
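+; same overall structure as the AVX2/AVX512 FILTER macros above; on x86-32
+; most of the p/q rows are kept in stack slots (the P*/Q* defines) since only
+; 8 xmm registers are available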
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+%if ARCH_X86_64
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+ mova P1, [dstq+mstrideq*2] ; p1
+ mova P0, [dstq+mstrideq*1] ; p0
+ mova Q0, [dstq+strideq*0] ; q0
+ mova Q1, [dstq+strideq*1] ; q1
+%else ; x86-32
+%define P1 [dstq+mstrideq*2]
+%define P0 [dstq+mstrideq*1]
+%define Q0 [dstq+strideq*0]
+%define Q1 [dstq+strideq*1]
+%endif ; x86-32/64
+%else ; %1 != 4
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+%if ARCH_X86_64
+ ; we load p3 later
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+ mova P2, [tmpq+strideq*1]
+ mova P1, [tmpq+strideq*2]
+ mova P0, [tmpq+stride3q]
+ mova Q0, [dstq+strideq*0]
+ mova Q1, [dstq+strideq*1]
+ mova Q2, [dstq+strideq*2]
+%if %1 != 6
+%define P3 [tmpq+strideq*0]
+%define Q3 m15
+ mova Q3, [dstq+stride3q]
+%endif ; %1 != 6
+%else ; x86-32
+%define P2 [tmpq+strideq*1]
+%define P1 [dstq+mstrideq*2]
+%define P0 [dstq+mstrideq*1]
+%define Q0 [dstq+strideq*0]
+%define Q1 [dstq+strideq*1]
+%define Q2 [dstq+strideq*2]
+%if %1 != 6
+%define P3 [dstq+mstrideq*4]
+%define Q3 [dstq+stride3q]
+%endif ; %1 != 6
+%endif ; x86-32/64
+%endif ; %1 ==/!= 4
+%else ; %2 != v
+ ; load lines
+%if %1 == 4
+ movq m0, [dstq+strideq*0-4]
+ movq m2, [dstq+strideq*1-4]
+ movq m4, [dstq+strideq*2-4]
+ movq m5, [dstq+stride3q -4]
+ lea tmpq, [dstq+strideq*4]
+ movq m3, [tmpq+strideq*0-4]
+ movq m6, [tmpq+strideq*1-4]
+ movq m1, [tmpq+strideq*2-4]
+ movq m7, [tmpq+stride3q -4]
+
+ ; transpose 4x8
+ ; m0: A-D0
+ ; m2: A-D1
+ ; m4: A-D2
+ ; m5: A-D3
+ ; m3: A-D4
+ ; m6: A-D5
+ ; m1: A-D6
+ ; m7: A-D7
+ punpcklwd m0, m2
+ punpcklwd m4, m5
+ punpcklwd m3, m6
+ punpcklwd m1, m7
+ ; m0: A0-1,B0-1,C0-1,D0-1
+ ; m4: A2-3,B2-3,C2-3,D2-3
+ ; m3: A4-5,B4-5,C4-5,D4-5
+ ; m1: A6-7,B6-7,C6-7,D6-7
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckhdq m4, m3, m1
+ punpckldq m3, m1
+ ; m0: A0-3,B0-3
+ ; m2: C0-3,D0-3
+ ; m3: A4-7,B4-7
+ ; m4: C4-7,D4-7
+ punpckhqdq m1, m0, m3
+ punpcklqdq m0, m3
+ punpckhqdq m3, m2, m4
+ punpcklqdq m2, m4
+ ; m0: A0-7
+ ; m1: B0-7
+ ; m2: C0-7
+ ; m3: D0-7
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%else
+%define P1 [esp+3*mmsize]
+%define P0 [esp+4*mmsize]
+%define Q0 [esp+5*mmsize]
+%define Q1 [esp+6*mmsize]
+ mova P1, m0
+ mova P0, m1
+ mova Q0, m2
+ mova Q1, m3
+%endif
+%elif %1 == 6 || %1 == 8
+ movu m0, [dstq+strideq*0-8]
+ movu m1, [dstq+strideq*1-8]
+ movu m2, [dstq+strideq*2-8]
+ movu m3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu m4, [tmpq+strideq*0-8]
+ movu m5, [tmpq+strideq*1-8]
+ movu m6, [tmpq+strideq*2-8]
+%if ARCH_X86_64
+ movu m7, [tmpq+stride3q -8]
+%endif
+
+ ; transpose 8x16
+ ; m0: A-H0,A-H8
+ ; m1: A-H1,A-H9
+ ; m2: A-H2,A-H10
+ ; m3: A-H3,A-H11
+ ; m4: A-H4,A-H12
+ ; m5: A-H5,A-H13
+ ; m6: A-H6,A-H14
+ ; m7: A-H7,A-H15
+%if ARCH_X86_64
+ punpcklwd m8, m0, m1
+%else
+ punpcklwd m7, m0, m1
+%endif
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m5
+ punpckhwd m4, m5
+%if ARCH_X86_64
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+%else
+ mova [rsp+3*16], m4
+ movu m4, [tmpq+stride3q -8]
+ punpcklwd m5, m6, m4
+ punpckhwd m6, m4
+%endif
+ ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32]
+ ; m0: E0-1,F0-1,G0-1,H0-1
+ ; m1: A2-3,B2-3,C2-3,D2-3
+ ; m2: E2-3,F2-3,G2-3,H2-3
+ ; m3: A4-5,B4-5,C4-5,D4-5
+ ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32]
+ ; m5: A6-7,B6-7,C6-7,D6-7
+ ; m6: E6-7,F6-7,G6-7,H6-7
+%if ARCH_X86_64
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+%else
+ punpckldq m4, m7, m1
+ punpckhdq m7, m1
+%endif
+ punpckldq m1, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m3, m5
+ punpckhdq m3, m5
+%if ARCH_X86_64
+ punpckldq m5, m4, m6
+ punpckhdq m4, m6
+%else
+ mova [rsp+4*16], m3
+ mova m3, [rsp+3*16]
+ punpckldq m5, m3, m6
+ punpckhdq m3, m6
+%endif
+ ; m7: A0-3,B0-3 [m4 on x86-32]
+ ; m8: C0-3,D0-3 [m7 on x86-32]
+ ; m1: E0-3,F0-3
+ ; m0: G0-3,H0-3
+ ; m2: A4-7,B4-7
+ ; m3: C4-7,D4-7 [r4 on x86-32]
+ ; m5: E4-7,F4-7
+ ; m4: G4-7,H4-7 [m3 on x86-32]
+%if ARCH_X86_64
+%if %1 != 6
+ punpcklqdq m6, m7, m2
+%endif
+ punpckhqdq m7, m2
+ punpcklqdq m2, m8, m3
+ punpckhqdq m8, m3
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+%if %1 != 6
+ punpckhqdq m5, m0, m4
+%endif
+ punpcklqdq m0, m4
+%if %1 == 8
+ mova [rsp+1*16], m6
+%define P3 [rsp+1*16]
+%endif
+ ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15
+ SWAP 7, 13
+ SWAP 8, 2, 9
+ SWAP 3, 10
+ SWAP 1, 11
+ SWAP 0, 14
+ SWAP 5, 15
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+%if %1 == 8
+%define Q3 m15
+%endif
+%else ; x86-32
+%if %1 == 8
+%define P3 [rsp+ 6*16]
+ punpcklqdq m6, m4, m2
+ mova P3, m6
+%endif
+ mova m6, [rsp+4*16]
+ punpckhqdq m4, m2
+ punpcklqdq m2, m7, m6
+ punpckhqdq m7, m6
+ punpcklqdq m6, m1, m5
+ punpckhqdq m1, m5
+%if %1 == 8
+%define Q3 [rsp+24*16]
+ punpckhqdq m5, m0, m3
+ mova Q3, m5
+%endif
+ punpcklqdq m0, m3
+%if %1 == 8
+%define P2 [rsp+18*16]
+%define P1 [rsp+19*16]
+%define P0 [rsp+20*16]
+%define Q0 [rsp+21*16]
+%define Q1 [rsp+22*16]
+%define Q2 [rsp+23*16]
+%else
+%define P2 [rsp+3*16]
+%define P1 [rsp+4*16]
+%define P0 [rsp+5*16]
+%define Q0 [rsp+6*16]
+%define Q1 [rsp+7*16]
+%define Q2 [rsp+8*16]
+%endif
+ mova P2, m4
+ mova P1, m2
+ mova P0, m7
+ mova Q0, m6
+ mova Q1, m1
+ mova Q2, m0
+%endif ; x86-32/64
+%else ; %1 == 16
+ ; We only use 14 pixels but we'll need the remainder at the end for
+ ; the second transpose
+ mova m0, [dstq+strideq*0-16]
+ mova m1, [dstq+strideq*1-16]
+ mova m2, [dstq+strideq*2-16]
+ mova m3, [dstq+stride3q -16]
+ lea tmpq, [dstq+strideq*4]
+ mova m4, [tmpq+strideq*0-16]
+ mova m5, [tmpq+strideq*1-16]
+ mova m6, [tmpq+strideq*2-16]
+%if ARCH_X86_64
+ mova m7, [tmpq+stride3q -16]
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ SWAP 5, 13
+ SWAP 6, 8
+ SWAP 7, 9
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%else ; x86-32
+%define P2 [esp+18*16]
+%define P1 [esp+19*16]
+%define P0 [esp+20*16]
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
+ [tmpq+stride3q -16], P2, "", a, a
+ mova P1, m6
+ mova P0, m7
+%endif ; x86-32/64
+ mova [rsp+ 7*16], m0
+ mova [rsp+ 8*16], m1
+ mova [rsp+ 9*16], m2
+ mova [rsp+10*16], m3
+%define P3 [rsp+6*16]
+ mova P3, m4
+
+ mova m0, [dstq+strideq*0]
+ mova m1, [dstq+strideq*1]
+ mova m2, [dstq+strideq*2]
+ mova m3, [dstq+stride3q ]
+ lea tmpq, [dstq+strideq*4]
+ mova m4, [tmpq+strideq*0]
+ mova m5, [tmpq+strideq*1]
+ mova m6, [tmpq+strideq*2]
+%if ARCH_X86_64
+ mova m7, [tmpq+stride3q ]
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10
+ SWAP 0, 10
+ SWAP 1, 11
+ SWAP 2, 14
+ SWAP 3, 15
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+%define Q3 m15
+%else ; x86-32
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
+ [tmpq+stride3q ], [rsp+12*16], "", a, a
+%define Q0 [esp+21*16]
+%define Q1 [esp+22*16]
+%define Q2 [esp+23*16]
+%define Q3 [esp+24*16]
+ mova Q0, m0
+ mova Q1, m1
+ mova Q2, m2
+ mova Q3, m3
+%endif ; x86-32/64
+
+ mova [rsp+11*16], m4
+%if ARCH_X86_64
+ mova [rsp+12*16], m5
+%endif
+ mova [rsp+13*16], m6
+ mova [rsp+14*16], m7
+%endif ; %1 == 4/6/8/16
+%endif ; %2 ==/!= v
+
+ ; load L/E/I/H
+%if ARCH_X86_32
+%define l_strideq r5
+ mov l_strideq, dword lstridem
+%ifidn %2, v
+%define lq r3
+ mov lq, dword lm
+%endif
+%endif
+%ifidn %2, v
+%if cpuflag(sse4)
+ pmovzxbw m1, [lq]
+ pmovzxbw m0, [lq+l_strideq]
+ pxor m2, m2
+%else ; ssse3
+ movq m1, [lq]
+ movq m0, [lq+l_strideq]
+ pxor m2, m2
+ REPX {punpcklbw x, m2}, m1, m0
+%endif ; ssse3/sse4
+%else ; %2 != v
+ movq m0, [lq] ; l0, l1
+ movq m1, [lq+l_strideq] ; l2, l3
+ punpckldq m0, m1 ; l0, l2, l1, l3
+ pxor m2, m2
+ punpcklbw m1, m0, m2 ; l0, l2
+ punpckhbw m0, m2 ; l1, l3
+%endif ; %2==/!=v
+%if ARCH_X86_32
+%ifidn %2, v
+%undef lq
+ mov mstrideq, mstridem
+%endif
+%endif
+ pcmpeqw m5, m2, m0
+ pand m1, m5
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1]
+ pcmpeqw m5, m2, m0 ; !L
+ psrlw m5, 1
+%if ARCH_X86_64
+ psrlw m2, m0, [lutq+128]
+ SPLATW m1, [lutq+136]
+%else ; x86-32
+ mov r5, lutm
+ psrlw m2, m0, [r5+128]
+ SPLATW m1, [r5+136]
+%endif ; x86-32/64
+ pminsw m2, m1
+ pmaxsw m2, [PIC_sym(pw_1)] ; I
+ psrlw m1, m0, 4 ; H
+ paddw m0, [PIC_sym(pw_2)]
+ paddw m0, m0
+ paddw m0, m2 ; E
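+ ; scale E/I/H from the 8-bit domain up to the working bit depth using
+ ; the per-bit-depth multiplier selected into bdmulq in the prologue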
+ REPX {pmullw x, [bdmulq]}, m0, m1, m2
+%if ARCH_X86_32
+%undef l_strideq
+ lea stride3q, [strideq*3]
+%endif
+
+ psubw m3, P1, P0 ; p1-p0
+ psubw m4, Q0, Q1 ; q0-q1
+ REPX {pabsw x, x}, m3, m4
+ pmaxsw m3, m5
+ pmaxsw m3, m4
+ pcmpgtw m7, m3, m1 ; hev
+%if %1 != 4
+ psubw m4, P2, P0 ; p2-p0
+ pabsw m4, m4
+ pmaxsw m4, m3
+%if %1 != 6
+ mova m6, P3 ; p3
+ psubw m5, m6, P0 ; p3-p0
+ pabsw m5, m5
+ pmaxsw m4, m5
+%endif ; %1 != 6
+ psubw m5, Q0, Q2 ; q0-q2
+ pabsw m5, m5
+ pmaxsw m4, m5
+%if %1 != 6
+ psubw m5, Q0, Q3 ; q0-q3
+ pabsw m5, m5
+ pmaxsw m4, m5
+%endif ; %1 != 6
+ pcmpgtw m4, [bdmulq] ; !flat8in
+
+ psubw m5, P2, P1 ; p2-p1
+ pabsw m5, m5
+%if %1 != 6
+ psubw m6, P2 ; p3-p2
+ pabsw m6, m6
+ pmaxsw m5, m6
+ psubw m6, Q2, Q3 ; q2-q3
+ pabsw m6, m6
+ pmaxsw m5, m6
+%endif ; %1 != 6
+ psubw m6, Q2, Q1 ; q2-q1
+ pabsw m6, m6
+ pmaxsw m5, m6
+
+%if %1 == 16
+ SPLATD m6, [maskq+8]
+ SPLATD m1, [maskq+4]
+ por m6, m1
+ pand m6, m12
+ pcmpeqd m6, m12
+ pand m5, m6
+%else ; %1 != 16
+ SPLATD m6, [maskq+4]
+ pand m6, m12
+ pcmpeqd m6, m12
+ pand m5, m6 ; only apply fm-wide to wd>4 blocks
+%endif ; %1==/!=16
+ pmaxsw m3, m5
+%endif ; %1 != 4
+ pcmpgtw m3, m2
+
+ psubw m5, P1, Q1 ; p1-q1
+ psubw m6, P0, Q0 ; p0-q0
+ REPX {pabsw x, x}, m5, m6
+ paddw m6, m6
+ psrlw m5, 1
+ paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m3, m5
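+ ; m3 = !fm: set where abs(p1-p0)/abs(q1-q0) (and the wider diffs for
+ ; wd>4) exceed I, or abs(p0-q0)*2+abs(p1-q1)/2 exceeds E; kept inverted
+ ; so it can be combined with the vmask bits via pandn below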
+
+%if %1 == 16
+
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+ mova m1, [tmpq+strideq*2]
+ mova m2, [tmpq+stride3q]
+%else ; %2 != v
+ mova m0, [rsp+ 8*16]
+ mova m1, [rsp+ 9*16]
+ mova m2, [rsp+10*16]
+%endif ; %2==/!=v
+ REPX {psubw x, P0}, m0, m1, m2
+ REPX {pabsw x, x}, m0, m1, m2
+ pmaxsw m1, m0
+ pmaxsw m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+ mova m2, [tmpq+strideq*1]
+ mova m5, [tmpq+strideq*2]
+%else ; %2 != v
+ mova m0, [rsp+11*16]
+ mova m2, [rsp+12*16]
+ mova m5, [rsp+13*16]
+%endif ; %2==/!=v
+ REPX {psubw x, Q0}, m0, m2, m5
+ REPX {pabsw x, x}, m0, m2, m5
+ pmaxsw m0, m2
+ pmaxsw m1, m5
+ pmaxsw m1, m0
+ pcmpgtw m1, [bdmulq] ; !flat8out
+ por m1, m4 ; !flat8in | !flat8out
+ SPLATD m2, [maskq+8]
+ pand m5, m2, m12
+ pcmpeqd m5, m12
+ pandn m1, m5 ; flat16
+ pandn m5, m3, m1 ; flat16 & fm
+ SWAP 1, 5
+
+ SPLATD m5, [maskq+4]
+ por m5, m2
+ pand m2, m5, m12
+ pcmpeqd m2, m12
+ pandn m4, m2 ; flat8in
+ pandn m2, m3, m4
+ SWAP 2, 4
+ SPLATD m2, [maskq+0]
+ por m2, m5
+ pand m2, m12
+ pcmpeqd m2, m12
+ pandn m3, m2
+ pandn m0, m4, m3 ; fm & !flat8 & !flat16
+ SWAP 0, 3
+ pandn m0, m1, m4 ; flat8 & !flat16
+ SWAP 0, 4
+%elif %1 != 4
+ SPLATD m0, [maskq+4]
+ pand m2, m0, m12
+ pcmpeqd m2, m12
+ pandn m4, m2
+ pandn m2, m3, m4 ; flat8 & fm
+ SWAP 2, 4
+ SPLATD m2, [maskq+0]
+ por m0, m2
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m3, m0
+ pandn m0, m4, m3 ; fm & !flat8
+ SWAP 0, 3
+%else ; %1 == 4
+ SPLATD m0, [maskq+0]
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m3, m0 ; fm
+%endif ; %1==/!=4
+
+ ; short filter
+%if ARCH_X86_64
+ SPLATW m0, r7m
+%else
+ SPLATW m0, bdmulm
+%endif
+ pcmpeqw m2, m2
+ psrlw m0, 1 ; 511 or 2047
+ pxor m2, m0 ; -512 or -2048
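+ ; m0/m2 are the iclip_diff() bounds for this bit depth:
+ ; (1 << (bitdepth-1)) - 1 and -(1 << (bitdepth-1))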
+
+ psubw m5, Q0, P0 ; q0-p0
+ paddw m6, m5, m5
+ paddw m6, m5 ; 3*(q0-p0)
+ psubw m5, P1, Q1 ; iclip_diff(p1-q1)
+ pminsw m5, m0
+ pmaxsw m5, m2
+ pand m5, m7 ; f=iclip_diff(p1-q1)&hev
+ paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f)
+ pminsw m5, m0
+ pmaxsw m5, m2
+ pand m3, m5 ; f&=fm
+ paddw m5, m3, [PIC_sym(pw_3)]
+ paddw m3, [PIC_sym(pw_4)]
+ REPX {pminsw x, m0}, m5, m3
+ psraw m5, 3 ; f2
+ psraw m3, 3 ; f1
+ psubw m0, m2 ; 1023 or 4095
+ pxor m2, m2
+%if ARCH_X86_64
+ paddw P0, m5
+ psubw Q0, m3
+%else
+ paddw m5, P0
+ psubw m6, Q0, m3
+ REPX {pminsw x, m0}, m5, m6
+ REPX {pmaxsw x, m2}, m5, m6
+%endif
+
+ paddw m3, [PIC_sym(pw_1)]
+ psraw m3, 1 ; f=(f1+1)>>1
+ pandn m7, m3 ; f&=!hev
+ SWAP 7, 3
+%if ARCH_X86_64
+ paddw P1, m3
+ psubw Q1, m3
+ REPX {pminsw x, m0}, P1, P0, Q0, Q1
+ REPX {pmaxsw x, m2}, P1, P0, Q0, Q1
+%else
+ psubw m7, Q1, m3
+ paddw m3, P1
+ REPX {pminsw x, m0}, m7, m3
+ REPX {pmaxsw x, m2}, m7, m3
+%if %1 > 4
+ mova P1, m3
+ mova P0, m5
+ mova Q0, m6
+ mova Q1, m7
+%endif
+%endif
+
+%if %1 == 16
+
+; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16
+; m12=filter bits mask
+; m13-15=p2/q2/q3
+; m0,2-3,5-7 = free
+
+ ; flat16 filter
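+ ; the 16-weight sums are kept as a running total in m3: the p5 row is
+ ; built in full, then each later row subtracts the samples leaving the
+ ; window and adds the ones entering it (the "sub X, add Y" comments);
+ ; the pw_8 added up front makes psrlw by 4 a rounded (sum+8)>>4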
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+ mova m6, [tmpq+strideq*4] ; p3
+ lea tmpq, [dstq+mstrideq*4]
+%else ; %2 != v
+ mova m0, [rsp+ 8*16]
+ mova m2, [rsp+ 9*16]
+ mova m7, [rsp+10*16]
+ mova m6, [rsp+ 6*16]
+%endif ; %2==/!=v
+
+ mova [rsp+ 0*16], m4
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psllw m3, m0, 3 ; p6*8
+ paddw m3, [PIC_sym(pw_8)]
+ paddw m5, m2, m7 ; p5+p4
+ psubw m3, m0
+ paddw m5, m5 ; (p5+p4)*2
+ paddw m3, m6 ; p6*7+p3
+ paddw m5, P2 ; (p5+p4)*2+p2
+ paddw m3, P1 ; p6*7+p3+p1
+ paddw m5, P0 ; (p5+p4)*2+p2+p0
+ paddw m3, Q0 ; p6*7+p3+p1+q0
+ paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m2
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+mstrideq*2], m5 ; p5
+%else ; %2 != v
+ mova [rsp+9*16], m5
+%endif ; %2==/!=v
+
+ ; sub p6*2, add p3/q1
+ paddw m3, m6
+ paddw m5, m0, m0
+ paddw m3, Q1
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m7
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+mstrideq*1], m5 ; p4
+%else ; %2 != v
+ mova [rsp+10*16], m5
+%endif ; %2==/!=v
+
+ ; sub p6/p5, add p2/q2
+ psubw m3, m0
+ paddw m5, P2, Q2
+ psubw m3, m2
+ paddw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m6
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m5 ; p3
+%else ; %2 != v
+ mova [rsp+6*16], m5
+%endif ; %2==/!=v
+
+%define WRITE_IN_PLACE 0
+%ifidn %2, v
+%if ARCH_X86_64
+%define WRITE_IN_PLACE 1
+%endif
+%endif
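+ ; for vertical edges on x86-64 the flat16 p2..q0 rows can be written
+ ; straight to dst; otherwise they are staged on the stack first so the
+ ; unfiltered values they would overwrite stay available to the rest of
+ ; the filter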
+
+ ; sub p6/p4, add p1/q3
+ paddw m3, P1
+ paddw m5, m0, m7
+ paddw m3, Q3
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P2
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [tmpq+strideq*1], m5
+%else
+ mova [rsp+1*16], m5 ; don't clobber p2/m13
+%endif
+
+ ; sub p6/p3, add p0/q4
+ paddw m3, P0
+ paddw m5, m0, m6
+%ifidn %2, v
+ paddw m3, [dstq+strideq*4]
+%else ; %2 != v
+ paddw m3, [rsp+11*16]
+%endif ; %2==/!=v
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P1
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq+mstrideq*2], m5
+%else
+ mova [rsp+2*16], m5 ; don't clobber p1/m3
+%endif
+
+ ; sub p6/p2, add q0/q5
+ paddw m3, Q0
+ paddw m5, m0, P2
+%ifidn %2, v
+%if ARCH_X86_32
+ lea r4, P2
+%endif
+ lea tmpq, [dstq+strideq*4]
+ paddw m3, [tmpq+strideq*1]
+%else ; %2 != v
+ paddw m3, [rsp+12*16]
+%endif ; %2==/!=v
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq+mstrideq*1], m5
+%else
+ mova [rsp+3*16], m5 ; don't clobber p0/m4
+%endif
+
+ ; sub p6/p1, add q1/q6
+ paddw m3, Q1
+ paddw m5, m0, P1
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else ; %2 != v
+ mova m0, [rsp+13*16] ; q6
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq], m5
+%else
+ mova [rsp+4*16], m5 ; don't clobber q0/m5
+%endif
+
+ ; sub p5/p0, add q2/q6
+ paddw m3, Q2
+ paddw m5, m2, P0
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q1
+ por m2, m5, m4 ; don't clobber q1/m6
+
+ ; sub p4/q0, add q3/q6
+ paddw m3, Q3
+ paddw m7, Q0
+ paddw m3, m0
+ psubw m3, m7
+ psrlw m7, m3, 4
+ pand m7, m1
+ pandn m4, m1, Q2
+ por m7, m4 ; don't clobber q2/m14
+
+ ; sub p3/q1, add q4/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*0]
+%else ; %2 != v
+ paddw m3, [rsp+11*16]
+%endif ; %2==/!=v
+ paddw m6, Q1
+ paddw m3, m0
+ psubw m3, m6
+ psrlw m6, m3, 4
+ pand m6, m1
+ pandn m4, m1, Q3
+ por m6, m4
+%if WRITE_IN_PLACE
+ mova [tmpq+mstrideq], m6 ; q3
+%else ; %2 != v
+ mova [rsp+5*16], m6
+%endif ; %2==/!=v
+
+ ; sub p2/q2, add q5/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*1]
+%if ARCH_X86_64
+ paddw m5, P2, Q2
+%else
+ ; tmpq is clobbered, so we use a backup pointer for P2 instead
+ paddw m5, [r4], Q2
+ mov pic_regq, pic_regm
+%endif
+%else ; %2 != v
+ paddw m3, [rsp+12*16]
+ paddw m5, P2, Q2
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*0]
+%else ; %2 != v
+ pandn m4, m1, [rsp+11*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m5 ; q4
+%else ; %2 != v
+ mova [rsp+11*16], m5
+%endif ; %2==/!=v
+
+ ; sub p1/q3, add q6*2
+ psubw m3, P1
+ paddw m0, m0
+ psubw m3, Q3
+ paddw m3, m0
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*1]
+%else ; %2 != v
+ pandn m4, m1, [rsp+12*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*1], m5 ; q5
+%else ; %2 != v
+ mova [rsp+12*16], m5
+%endif ; %2==/!=v
+
+ mova m4, [rsp+0*16]
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%if ARCH_X86_64
+ SWAP 2, 11
+ SWAP 7, 14
+ SWAP 6, 15
+%else ; x86-32
+ mova Q1, m2
+ mova Q2, m7
+%endif ; x86-32/64
+%if WRITE_IN_PLACE
+ mova P2, [tmpq+strideq*1]
+ mova P1, [tmpq+strideq*2]
+ mova P0, [tmpq+stride3q]
+ mova Q0, [dstq]
+%elif ARCH_X86_64
+ mova P2, [rsp+1*16]
+ mova P1, [rsp+2*16]
+ mova P0, [rsp+3*16]
+ mova Q0, [rsp+4*16]
+%else ; !WRITE_IN_PLACE & x86-32
+ mova m0, [rsp+1*16]
+ mova m1, [rsp+2*16]
+ mova m2, [rsp+3*16]
+ mova m3, [rsp+4*16]
+ mova m7, [rsp+5*16]
+ mova P2, m0
+ mova P1, m1
+ mova P0, m2
+ mova Q0, m3
+ mova Q3, m7
+%endif ; WRITE_IN_PLACE / x86-32/64
+%undef WRITE_IN_PLACE
+%endif ; %1 == 16
+
+%if %1 >= 8
+
+ ; flat8 filter
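+ ; 8-weight sums, maintained incrementally in m2; pmulhrsw with pw_4096
+ ; computes (sum*4096 + (1<<14)) >> 15 == (sum+4) >> 3, i.e. the rounded
+ ; average without needing an explicit +4 bias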
+ mova m0, P3 ; p3
+ paddw m1, m0, P2 ; p3+p2
+ paddw m2, P1, P0 ; p1+p0
+ paddw m3, m1, m1 ; 2*(p3+p2)
+ paddw m2, m0 ; p1+p0+p3
+ paddw m3, Q0 ; 2*(p3+p2)+q0
+ paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0
+ pmulhrsw m7, m2, [PIC_sym(pw_4096)]
+ psubw m7, P2
+ pand m7, m4
+
+ paddw m3, P1, Q1 ; p1+q1
+ psubw m2, m1 ; 2*p3+p2+p1+p0+q0
+ paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1
+ pmulhrsw m3, m2, [PIC_sym(pw_4096)]
+ psubw m3, P1
+ pand m3, m4
+
+ paddw m5, m0, P1 ; p3+p1
+ paddw m6, P0, Q2 ; p0+q2
+ psubw m2, m5 ; p3+p2+p1+p0+q0+q1
+ paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2
+ pmulhrsw m5, m2, [PIC_sym(pw_4096)]
+ psubw m5, P0
+ pand m5, m4
+
+ paddw m6, m0, P0 ; p3+p0
+ paddw m1, Q0, Q3 ; q0+q3
+ psubw m2, m6 ; p2+p1+p0+q0+q1+q2
+ paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3
+ pmulhrsw m6, m2, [PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3
+ paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3
+ paddw m1, P2, Q0 ; p2+q0
+ psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3
+ pmulhrsw m1, m2, [PIC_sym(pw_4096)]
+ psubw m1, Q1
+ pand m1, m4
+
+ psubw m2, P1 ; p0+q0+2*q1+q2+2*q3
+ psubw m2, Q1 ; p0+q0+q1+q2+2*q3
+ paddw m0, Q3, Q2 ; q3+q2
+ paddw m2, m0 ; p0+q0+q1+2*q2+3*q3
+ pmulhrsw m2, [PIC_sym(pw_4096)]
+ psubw m2, Q2
+ pand m2, m4
+
+ paddw m7, P2
+ paddw m3, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m1, Q1
+ paddw m2, Q2
+
+%ifidn %2, v
+ mova [tmpq+strideq*1], m7 ; p2
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m1 ; q1
+ mova [dstq+strideq*2], m2 ; q2
+%else ; %2 != v
+ mova m0, P3
+
+%if %1 == 8
+ lea tmpq, [dstq+strideq*4]
+%if ARCH_X86_64
+ SWAP 4, 15
+ TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8
+%else
+ TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \
+ Q3, [tmpq+strideq*1-8], a, u
+%endif
+
+ ; write 8x8
+ movu [dstq+strideq*0-8], m0
+ movu [dstq+strideq*1-8], m7
+ movu [dstq+strideq*2-8], m3
+ movu [dstq+stride3q -8], m5
+ movu [tmpq+strideq*0-8], m6
+%if ARCH_X86_64
+ movu [tmpq+strideq*1-8], m1
+%endif
+ movu [tmpq+strideq*2-8], m2
+ movu [tmpq+stride3q -8], m4
+ lea dstq, [dstq+strideq*8]
+%else ; %1 != 8
+%if ARCH_X86_64
+ SWAP 6, 8
+ SWAP 1, 9
+ SWAP 2, 10
+%else
+ mova [rsp+1*16], m6
+ mova [rsp+2*16], m1
+ mova [rsp+3*16], m2
+%endif
+
+ mova m1, [rsp+ 7*16]
+ mova m2, [rsp+ 8*16]
+ mova m4, [rsp+ 9*16]
+ mova m6, [rsp+10*16]
+ lea tmpq, [dstq+strideq*4]
+%if ARCH_X86_64
+ TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11
+%else
+ mova [rsp+7*16], m5
+ TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \
+ [rsp+7*16], [tmpq+strideq*1-16], a, a
+%endif
+
+ mova [dstq+strideq*0-16], m1
+ mova [dstq+strideq*1-16], m2
+ mova [dstq+strideq*2-16], m4
+ mova [dstq+stride3q -16], m6
+ mova [tmpq+strideq*0-16], m0
+%if ARCH_X86_64
+ mova [tmpq+strideq*1-16], m7
+%endif
+ mova [tmpq+strideq*2-16], m3
+ mova [tmpq+stride3q -16], m5
+
+%if ARCH_X86_64
+ SWAP 6, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 4, 15
+%else
+ mova m6, [rsp+1*16]
+ mova m1, [rsp+2*16]
+ mova m2, [rsp+3*16]
+ mova m4, Q3
+%endif
+ mova m0, [rsp+11*16]
+ mova m3, [rsp+12*16]
+ mova m5, [rsp+13*16]
+%if ARCH_X86_64
+ mova m7, [rsp+14*16]
+ TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8
+%else
+ TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \
+ [rsp+14*16], [tmpq+strideq*1], a, a
+%endif
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m4
+ mova [tmpq+strideq*0], m0
+%if ARCH_X86_64
+ mova [tmpq+strideq*1], m3
+%endif
+ mova [tmpq+strideq*2], m5
+ mova [tmpq+stride3q ], m7
+ lea dstq, [dstq+strideq*8]
+%endif ; %1==/!=8
+%endif ; %2==/!=v
+%elif %1 == 6
+ ; flat6 filter
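+ ; chroma variant: 8-weight sums over p2..q2, using the same pw_4096
+ ; rounding trick as the flat8 filter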
+ paddw m3, P1, P0 ; p1+p0
+ paddw m3, P2 ; p2+p1+p0
+ paddw m6, P2, Q0 ; p2+q0
+ paddw m3, m3 ; 2*(p2+p1+p0)
+ paddw m3, m6 ; p2+2*(p2+p1+p0)+q0
+ pmulhrsw m2, m3, [PIC_sym(pw_4096)]
+ psubw m2, P1
+ pand m2, m4
+
+ paddw m3, Q0 ; p2+2*(p2+p1+p0+q0)
+ paddw m6, P2, P2 ; 2*p2
+ paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1
+ psubw m3, m6 ; p2+2*(p1+p0+q0)+q1
+ pmulhrsw m5, m3, [PIC_sym(pw_4096)]
+ psubw m5, P0
+ pand m5, m4
+
+ paddw m3, Q1 ; p2+2*(p1+p0+q0+q1)
+ paddw m6, P2, P1 ; p2+p1
+ paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2
+ psubw m3, m6 ; p1+2*(p0+q0+q1)+q2
+ pmulhrsw m6, m3, [PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ psubw m3, P1 ; 2*(p0+q0+q1)+q2
+%if ARCH_X86_64
+ paddw Q2, Q2 ; q2*2
+%else
+ mova m0, Q2
+ paddw m0, m0
+%endif
+ psubw m3, P0 ; p0+2*(q0+q1)+q2
+%if ARCH_X86_64
+ paddw m3, Q2 ; p0+2*(q0+q1+q2)+q2
+%else
+ paddw m3, m0
+%endif
+ pmulhrsw m3, [PIC_sym(pw_4096)]
+ psubw m3, Q1
+ pand m3, m4
+
+ paddw m2, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m3, Q1
+
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m2 ; p1
+ mova [dstq+mstrideq*1], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m3 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
+%endif ; %2==/!=v
+%else ; %1 == 4
+%if ARCH_X86_64
+%ifidn %2, v
+ mova [dstq+mstrideq*2], P1 ; p1
+ mova [dstq+mstrideq*1], P0 ; p0
+ mova [dstq+strideq*0], Q0 ; q0
+ mova [dstq+strideq*1], Q1 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
+%endif ; %2==/!=v
+%else ; x86-32
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m3
+ mova [dstq+mstrideq*1], m5
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m7
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
+%endif ; %2==/!=v
+%endif ; x86-32/64
+%endif ; %1
+%undef P3
+%undef P2
+%undef P1
+%undef P0
+%undef Q0
+%undef Q1
+%undef Q2
+%undef Q3
+%endmacro
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16 code
+%if ARCH_X86_64
+cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+%else
+; stack layout [32bit only]:
+; r1-4 - p2-q0 post-filter16
+; r5 - p3
+; r6 - q3 post-filter16
+; r7 - GPRs [mask_bitsm, mstridem]
+; r8 - m12/pb_mask
+; r9 - bdmulq
+cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
+ dst, stride, mask, mstride, pic_reg, stride3, tmp
+ RELOC_ARGS v, 10*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+%define pic_regm dword [esp+7*16+2*gprsize]
+ mov pic_regm, pic_regq
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+9*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+ sub r3, dword lstridem
+ mov dword lm, r3
+%endif
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mstridem dword [esp+7*16+1*gprsize]
+ mov mstridem, mstrideq
+%define mask_bitsm dword [esp+7*16+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+8*16]
+ mova m12, m0
+%endif
+
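+; each iteration filters 8 pixels (one 16-byte xmm row, i.e. two 4-pixel
+; units): mask_bits selects the two matching vmask[] bits and m12 holds
+; the corresponding pb_mask lanes; both are shifted left by 2 per iteration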
+.loop:
+%if ARCH_X86_64
+ test [maskq+8], mask_bitsd ; vmask[2]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+8], r6d
+%endif
+ jz .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .end
+
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ add lq, 8
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add dword lm, 8
+%endif
+ add dstq, 16
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16
+; r1-4 - p2-q0 post-filter16 backup
+; r5 - q3 post-filter16 backup
+; r6 - p3
+; r7-10 - p7-4
+; r11-14 - q4-7
+%if ARCH_X86_64
+cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov hd, hm
+ shl l_strideq, 2
+%else
+; stack layout [32bit only]:
+; r15 - GPRs [mask_bitsm]
+; r16 - m12/pb_mask
+; r17 - bdmulq
+; r18-24 - p2-q3
+cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \
+ dst, stride, mask, l, pic_reg, stride3, tmp
+ RELOC_ARGS h, 25*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+17*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+%endif
+ sub lq, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+15*16+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+16*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+8], mask_bitsd ; vmask[2]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+8], r6d
+%endif
+ jz .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .no_filter
+
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ lea lq, [lq+l_strideq*2]
+ shl mask_bitsd, 2
+ sub hd, 2
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add lq, dword lstridem
+ add lq, dword lstridem
+ shl mask_bitsm, 2
+ sub dword hm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+%else
+; stack layout [32bit only]:
+; r0 - GPRs [mask_bitsm, mstridem]
+; r1 - m12/pb_mask
+; r2 - bdmulq
+cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \
+ dst, stride, mask, mstride, pic_reg, stride3, tmp
+ RELOC_ARGS v, 3*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+2*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+ sub r3, dword lstridem
+ mov dword lm, r3
+%endif
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+0*gprsize]
+%define mstridem dword [esp+1*gprsize]
+ mov mask_bitsm, 0x3
+ mov mstridem, mstrideq
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+1*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .end
+
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ add lq, 8
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add dword lm, 8
+%endif
+ add dstq, 16
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov hd, hm
+ shl l_strideq, 2
+%else
+; stack layout [32bit only]:
+; r0 - GPRs [mask_bitsm]
+; r1 - m12/pb_mask
+; r2 - bdmulq
+; r3-8 - p2-q2
+cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \
+ dst, stride, mask, l, pic_reg, stride3, tmp
+ RELOC_ARGS h, 9*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+2*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+%endif
+ sub lq, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+1*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .no_filter
+
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ lea lq, [lq+l_strideq*2]
+ shl mask_bitsd, 2
+ sub hd, 2
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add lq, dword lstridem
+ add lq, dword lstridem
+ shl mask_bitsm, 2
+ sub dword hm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
diff --git a/third_party/dav1d/src/x86/loopfilter_avx2.asm b/third_party/dav1d/src/x86/loopfilter_avx2.asm
new file mode 100644
index 0000000000..84696c758a
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_avx2.asm
@@ -0,0 +1,1569 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+pb_7_1: times 16 db 7, 1
+pb_3_1: times 16 db 3, 1
+pb_2_1: times 16 db 2, 1
+pb_m1_0: times 16 db -1, 0
+pb_m1_1: times 16 db -1, 1
+pb_m1_2: times 16 db -1, 2
+pb_1: times 32 db 1
+pb_2: times 32 db 2
+pb_3: times 32 db 3
+pb_4: times 32 db 4
+pb_16: times 32 db 16
+pb_63: times 32 db 63
+pb_64: times 32 db 64
+pb_128: times 32 db 0x80
+pb_129: times 32 db 0x81
+pb_240: times 32 db 0xf0
+pb_248: times 32 db 0xf8
+pb_254: times 32 db 0xfe
+
+pw_2048: times 16 dw 2048
+pw_4096: times 16 dw 4096
+
+pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
+ ; transpose 16x4
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+
+ ; write out
+ movd [dstq+strideq*0-2], xm%4
+ pextrd [dstq+strideq*1-2], xm%4, 1
+ pextrd [dstq+strideq*2-2], xm%4, 2
+ pextrd [dstq+stride3q-2], xm%4, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%5
+ pextrd [dstq+strideq*1-2], xm%5, 1
+ pextrd [dstq+strideq*2-2], xm%5, 2
+ pextrd [dstq+stride3q-2], xm%5, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%2
+ pextrd [dstq+strideq*1-2], xm%2, 1
+ pextrd [dstq+strideq*2-2], xm%2, 2
+ pextrd [dstq+stride3q-2], xm%2, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%1
+ pextrd [dstq+strideq*1-2], xm%1, 1
+ pextrd [dstq+strideq*2-2], xm%1, 2
+ pextrd [dstq+stride3q-2], xm%1, 3
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm%4, m%4, 1
+ vextracti128 xm%5, m%5, 1
+ vextracti128 xm%2, m%2, 1
+ vextracti128 xm%1, m%1, 1
+
+ movd [dstq+strideq*0-2], xm%4
+ pextrd [dstq+strideq*1-2], xm%4, 1
+ pextrd [dstq+strideq*2-2], xm%4, 2
+ pextrd [dstq+stride3q-2], xm%4, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%5
+ pextrd [dstq+strideq*1-2], xm%5, 1
+ pextrd [dstq+strideq*2-2], xm%5, 2
+ pextrd [dstq+stride3q-2], xm%5, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%2
+ pextrd [dstq+strideq*1-2], xm%2, 1
+ pextrd [dstq+strideq*2-2], xm%2, 2
+ pextrd [dstq+stride3q-2], xm%2, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%1
+ pextrd [dstq+strideq*1-2], xm%1, 1
+ pextrd [dstq+strideq*2-2], xm%1, 2
+ pextrd [dstq+stride3q-2], xm%1, 3
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
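+; 16x16 byte transpose; one row at a time is parked in the caller-provided
+; memory slot (%3) as a spare register, since the transpose needs more
+; temporaries than the 16 ymm registers provide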
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%if %1 == 0
+ mova %3, m15
+%endif
+
+ ; input in m0-15
+ punpcklbw m15, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m12, m13
+ punpckhbw m12, m13
+ mova m13, %3
+ mova %3, m12
+ punpcklbw m12, m14, m13
+ punpckhbw m13, m14, m13
+
+ ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
+ punpcklwd m14, m15, m1
+ punpckhwd m15, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ mova m12, %3
+ mova %3, m11
+ punpcklwd m11, m12, m13
+ punpckhwd m12, m13
+
+ ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
+ punpckldq m13, m14, m2
+ punpckhdq m14, m2
+ punpckldq m2, m15, m3
+ punpckhdq m15, m3
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ punpckldq m5, m0, m4
+ punpckhdq m0, m4
+ punpckldq m4, m6, m10
+ punpckhdq m6, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m12
+ punpckhdq m8, m12
+ mova m12, %3
+ mova %3, m8
+ punpckldq m8, m7, m12
+ punpckhdq m7, m12
+
+ ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m12, m13, m4
+ punpckhqdq m13, m4
+ punpcklqdq m4, m14, m6
+ punpckhqdq m14, m6
+ punpcklqdq m6, m2, m8
+ punpckhqdq m2, m8
+ punpcklqdq m8, m15, m7
+ punpckhqdq m15, m7
+ punpcklqdq m7, m3, m10
+ punpckhqdq m3, m10
+ punpcklqdq m10, m1, m9
+ punpckhqdq m1, m9
+ punpcklqdq m9, m5, m11
+ punpckhqdq m5, m11
+ mova m11, %3
+ mova %3, m12
+ punpcklqdq m12, m0, m11
+ punpckhqdq m0, m11
+%if %2 == 0
+ mova m11, %3
+%endif
+
+ ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
+ SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
+ SWAP 3, 14, 12, 9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+%if %1 != 6
+ mova m12, [tmpq+strideq*0]
+%endif
+ mova m13, [tmpq+strideq*1]
+ mova m3, [tmpq+strideq*2]
+ mova m4, [tmpq+stride3q]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m15, [dstq+stride3q]
+%endif
+%endif
+%else
+ ; load lines
+%if %1 == 4
+ movd xm3, [dstq+strideq*0-2]
+ movd xm4, [dstq+strideq*1-2]
+ movd xm5, [dstq+strideq*2-2]
+ movd xm6, [dstq+stride3q -2]
+ lea tmpq, [dstq+strideq*4]
+ pinsrd xm3, [tmpq+strideq*0-2], 2
+ pinsrd xm4, [tmpq+strideq*1-2], 2
+ pinsrd xm5, [tmpq+strideq*2-2], 2
+ pinsrd xm6, [tmpq+stride3q -2], 2
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm3, [tmpq+strideq*0-2], 1
+ pinsrd xm4, [tmpq+strideq*1-2], 1
+ pinsrd xm5, [tmpq+strideq*2-2], 1
+ pinsrd xm6, [tmpq+stride3q -2], 1
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm3, [tmpq+strideq*0-2], 3
+ pinsrd xm4, [tmpq+strideq*1-2], 3
+ pinsrd xm5, [tmpq+strideq*2-2], 3
+ pinsrd xm6, [tmpq+stride3q -2], 3
+ lea tmpq, [tmpq+strideq*4]
+ movd xm12, [tmpq+strideq*0-2]
+ movd xm13, [tmpq+strideq*1-2]
+ movd xm14, [tmpq+strideq*2-2]
+ movd xm15, [tmpq+stride3q -2]
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm12, [tmpq+strideq*0-2], 2
+ pinsrd xm13, [tmpq+strideq*1-2], 2
+ pinsrd xm14, [tmpq+strideq*2-2], 2
+ pinsrd xm15, [tmpq+stride3q -2], 2
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm12, [tmpq+strideq*0-2], 1
+ pinsrd xm13, [tmpq+strideq*1-2], 1
+ pinsrd xm14, [tmpq+strideq*2-2], 1
+ pinsrd xm15, [tmpq+stride3q -2], 1
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm12, [tmpq+strideq*0-2], 3
+ pinsrd xm13, [tmpq+strideq*1-2], 3
+ pinsrd xm14, [tmpq+strideq*2-2], 3
+ pinsrd xm15, [tmpq+stride3q -2], 3
+ vinserti128 m3, xm12, 1
+ vinserti128 m4, xm13, 1
+ vinserti128 m5, xm14, 1
+ vinserti128 m6, xm15, 1
+
+ ; transpose 4x16
+ ; xm3: A-D0,A-D8,A-D4,A-D12
+ ; xm4: A-D1,A-D9,A-D5,A-D13
+ ; xm5: A-D2,A-D10,A-D6,A-D14
+ ; xm6: A-D3,A-D11,A-D7,A-D15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
+ ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
+ ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
+ ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
+ punpcklwd m6, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ ; xm6: A0-3,B0-3,C0-3,D0-3
+ ; xm7: A8-11,B8-11,C8-11,D8-11
+ ; xm4: A4-7,B4-7,C4-7,D4-7
+ ; xm3: A12-15,B12-15,C12-15,D12-15
+ punpckldq m5, m6, m4
+ punpckhdq m6, m4
+ punpckldq m4, m7, m3
+ punpckhdq m7, m3
+ ; xm5: A0-7,B0-7
+ ; xm6: C0-7,D0-7
+ ; xm4: A8-15,B8-15
+ ; xm7: C8-15,D8-15
+ punpcklqdq m3, m5, m4
+ punpckhqdq m4, m5, m4
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ ; xm3: A0-15
+ ; xm5: B0-15
+ ; xm4: C0-15
+ ; xm6: D0-15
+%elif %1 == 6 || %1 == 8
+ movq xm3, [dstq+strideq*0-%1/2]
+ movq xm4, [dstq+strideq*1-%1/2]
+ movq xm5, [dstq+strideq*2-%1/2]
+ movq xm6, [dstq+stride3q -%1/2]
+ lea tmpq, [dstq+strideq*8]
+ movhps xm3, [tmpq+strideq*0-%1/2]
+ movhps xm4, [tmpq+strideq*1-%1/2]
+ movhps xm5, [tmpq+strideq*2-%1/2]
+ movhps xm6, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movq xm7, [tmpq+strideq*0-%1/2]
+ movq xm8, [tmpq+strideq*1-%1/2]
+ movq xm9, [tmpq+strideq*2-%1/2]
+ movq xm11, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movhps xm7, [tmpq+strideq*0-%1/2]
+ movhps xm8, [tmpq+strideq*1-%1/2]
+ movhps xm9, [tmpq+strideq*2-%1/2]
+ movhps xm11, [tmpq+stride3q -%1/2]
+ vinserti128 m3, xm7, 1
+ vinserti128 m4, xm8, 1
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm11, 1
+ lea tmpq, [dstq+strideq*4]
+ movq xm12, [tmpq+strideq*0-%1/2]
+ movq xm13, [tmpq+strideq*1-%1/2]
+ movq xm14, [tmpq+strideq*2-%1/2]
+ movq xm15, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movhps xm12, [tmpq+strideq*0-%1/2]
+ movhps xm13, [tmpq+strideq*1-%1/2]
+ movhps xm14, [tmpq+strideq*2-%1/2]
+ movhps xm15, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movq xm7, [tmpq+strideq*0-%1/2]
+ movq xm8, [tmpq+strideq*1-%1/2]
+ movq xm9, [tmpq+strideq*2-%1/2]
+ movq xm11, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movhps xm7, [tmpq+strideq*0-%1/2]
+ movhps xm8, [tmpq+strideq*1-%1/2]
+ movhps xm9, [tmpq+strideq*2-%1/2]
+ movhps xm11, [tmpq+stride3q -%1/2]
+ vinserti128 m12, xm7, 1
+ vinserti128 m13, xm8, 1
+ vinserti128 m14, xm9, 1
+ vinserti128 m15, xm11, 1
+
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm12: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ punpcklbw m6, m12, m13
+ punpckhbw m12, m13
+ punpcklbw m13, m14, m15
+ punpckhbw m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m15, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m13
+ punpckhwd m6, m13
+ punpcklwd m13, m12, m14
+ punpckhwd m12, m14
+ ; xm15: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm13: A12-15,B12-15,C12-15,D12-15
+ ; xm12: E12-15,F12-15,G12-15,H12-15
+ punpckldq m14, m15, m5
+ punpckhdq m15, m5
+ punpckldq m5, m7, m6
+%if %1 != 6
+ punpckhdq m7, m6
+%endif
+ punpckldq m6, m4, m13
+ punpckhdq m4, m13
+ punpckldq m13, m3, m12
+%if %1 != 6
+ punpckhdq m12, m3, m12
+%endif
+ ; xm14: A0-7,B0-7
+ ; xm15: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm13: E8-15,F8-15
+ ; xm12: G8-15,H8-15
+ punpcklqdq m3, m14, m6
+ punpckhqdq m14, m6
+ punpckhqdq m6, m15, m4
+ punpcklqdq m15, m4
+ punpcklqdq m4, m5, m13
+ punpckhqdq m13, m5, m13
+%if %1 == 8
+ punpcklqdq m5, m7, m12
+ punpckhqdq m12, m7, m12
+ ; xm3: A0-15
+ ; xm14: B0-15
+ ; xm15: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm13: F0-15
+ ; xm5: G0-15
+ ; xm12: H0-15
+ SWAP 12, 3, 15
+ SWAP 13, 14, 5, 4, 6
+ ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
+%else
+ SWAP 13, 3, 14
+ SWAP 6, 4, 15, 5
+ ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+%endif
+%else
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+ movu xm0, [dstq+strideq*0-8]
+ movu xm1, [dstq+strideq*1-8]
+ movu xm2, [dstq+strideq*2-8]
+ movu xm3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu xm4, [tmpq+strideq*0-8]
+ movu xm5, [tmpq+strideq*1-8]
+ movu xm6, [tmpq+strideq*2-8]
+ movu xm7, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu xm8, [tmpq+strideq*0-8]
+ movu xm9, [tmpq+strideq*1-8]
+ movu xm10, [tmpq+strideq*2-8]
+ movu xm11, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu xm12, [tmpq+strideq*0-8]
+ movu xm13, [tmpq+strideq*1-8]
+ movu xm14, [tmpq+strideq*2-8]
+ movu xm15, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, [tmpq+strideq*0-8], 1
+ vinserti128 m1, [tmpq+strideq*1-8], 1
+ vinserti128 m2, [tmpq+strideq*2-8], 1
+ vinserti128 m3, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m4, [tmpq+strideq*0-8], 1
+ vinserti128 m5, [tmpq+strideq*1-8], 1
+ vinserti128 m6, [tmpq+strideq*2-8], 1
+ vinserti128 m7, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m8, [tmpq+strideq*0-8], 1
+ vinserti128 m9, [tmpq+strideq*1-8], 1
+ vinserti128 m10, [tmpq+strideq*2-8], 1
+ vinserti128 m11, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m12, [tmpq+strideq*0-8], 1
+ vinserti128 m13, [tmpq+strideq*1-8], 1
+ vinserti128 m14, [tmpq+strideq*2-8], 1
+ vinserti128 m15, [tmpq+stride3q -8], 1
+
+ TRANSPOSE_16X16B 0, 1, [rsp+11*32]
+ mova [rsp+12*32], m1
+ mova [rsp+13*32], m2
+ mova [rsp+14*32], m3
+ mova [rsp+15*32], m12
+ mova [rsp+16*32], m13
+ mova [rsp+17*32], m14
+ mova [rsp+18*32], m15
+ ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
+ SWAP 12, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 15
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ movq xm1, [lq]
+ movq xm2, [lq+l_strideq*2]
+ movhps xm1, [lq+l_strideq]
+ movhps xm2, [lq+l_stride3q]
+ lea lq, [lq+l_strideq*4]
+ movq xm10, [lq]
+ movq xm0, [lq+l_strideq*2]
+ movhps xm10, [lq+l_strideq]
+ movhps xm0, [lq+l_stride3q]
+ lea lq, [lq+l_strideq*4]
+ vinserti128 m1, xm10, 1
+ vinserti128 m2, xm0, 1
+ shufps m0, m1, m2, q3131
+ shufps m1, m2, q2020
+%endif
+ pxor m2, m2
+ pcmpeqb m10, m2, m0
+ pand m1, m10
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
+ pcmpeqb m10, m2, m0 ; !L
+ psrlq m2, m0, [lutq+128]
+ pand m2, [pb_63]
+ vpbroadcastb m1, [lutq+136]
+ pminub m2, m1
+ pmaxub m2, [pb_1] ; I
+ pand m1, m0, [pb_240]
+ psrlq m1, 4 ; H
+ paddb m0, [pb_2]
+ paddb m0, m0
+ paddb m0, m2 ; E
+ pxor m1, [pb_128]
+ pxor m2, [pb_128]
+ pxor m0, [pb_128]
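+ ; pcmpgtb is a signed comparison, so the unsigned thresholds and the
+ ; absolute differences computed below are all biased by 128 to make the
+ ; signed ordering match the unsigned one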
+
+ ABSSUB m8, m3, m4, m9 ; abs(p1-p0)
+ pmaxub m8, m10
+ ABSSUB m9, m5, m6, m10 ; abs(q1-q0)
+ pmaxub m8, m9
+%if %1 == 4
+ pxor m8, [pb_128]
+ pcmpgtb m7, m8, m1 ; hev
+%else
+ pxor m7, m8, [pb_128]
+ pcmpgtb m7, m1 ; hev
+
+%if %1 == 6
+ ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
+ pmaxub m9, m8
+%else
+ ABSSUB m9, m12, m4, m10 ; abs(p3-p0)
+ pmaxub m9, m8
+ ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
+ pmaxub m9, m10
+%endif
+ ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
+ pmaxub m9, m10
+%if %1 != 6
+ ABSSUB m10, m5, m15, m11 ; abs(q3-q0)
+ pmaxub m9, m10
+%endif
+ pxor m9, [pb_128]
+ pcmpgtb m9, [pb_129] ; !flat8in
+
+%if %1 == 6
+ ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
+%else
+ ABSSUB m10, m12, m13, m11 ; abs(p3-p2)
+ ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m10, m11
+ ABSSUB m11, m14, m15, m1 ; abs(q3-q2)
+ pmaxub m10, m11
+%endif
+ ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m10, m11
+%if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ vpbroadcastd m1, [maskq+4]
+ por m11, m1
+ pand m11, [pb_mask]
+ pcmpeqd m11, [pb_mask]
+ pand m10, m11
+%else
+ vpbroadcastd m11, [maskq+4]
+ pand m11, [pb_mask]
+ pcmpeqd m11, [pb_mask]
+ pand m10, m11 ; only apply fm-wide to wd>4 blocks
+%endif
+ pmaxub m8, m10
+
+ pxor m8, [pb_128]
+%endif
+ pcmpgtb m8, m2
+
+ ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
+ ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
+ paddusb m11, m11
+ pand m10, [pb_254]
+ psrlq m10, 1
+ paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pxor m10, [pb_128]
+ pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m8, m10
+
+%if %1 == 16
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+12*32]
+%endif
+ ABSSUB m1, m0, m4, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+13*32]
+%endif
+ ABSSUB m2, m0, m4, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+stride3q]
+%else
+ mova m0, [rsp+14*32]
+%endif
+ ABSSUB m2, m0, m4, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+%else
+ mova m0, [rsp+15*32]
+%endif
+ ABSSUB m2, m0, m5, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+16*32]
+%endif
+ ABSSUB m2, m0, m5, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+17*32]
+%endif
+ ABSSUB m2, m0, m5, m10
+ pmaxub m1, m2
+ pxor m1, [pb_128]
+ pcmpgtb m1, [pb_129] ; !flat8out
+ por m1, m9 ; !flat8in | !flat8out
+ vpbroadcastd m2, [maskq+8]
+ pand m10, m2, [pb_mask]
+ pcmpeqd m10, [pb_mask]
+ pandn m1, m10 ; flat16
+ pandn m1, m8, m1 ; flat16 & fm
+
+ vpbroadcastd m10, [maskq+4]
+ por m10, m2
+ pand m2, m10, [pb_mask]
+ pcmpeqd m2, [pb_mask]
+ pandn m9, m2 ; flat8in
+ pandn m9, m8, m9
+ vpbroadcastd m2, [maskq+0]
+ por m2, m10
+ pand m2, [pb_mask]
+ pcmpeqd m2, [pb_mask]
+ pandn m8, m2
+ pandn m8, m9, m8 ; fm & !flat8 & !flat16
+ pandn m9, m1, m9 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ pand m2, m0, [pb_mask]
+ pcmpeqd m2, [pb_mask]
+ pandn m9, m2
+ pandn m9, m8, m9 ; flat8 & fm
+ vpbroadcastd m2, [maskq+0]
+ por m0, m2
+ pand m0, [pb_mask]
+ pcmpeqd m0, [pb_mask]
+ pandn m8, m0
+ pandn m8, m9, m8 ; fm & !flat8
+%else
+ vpbroadcastd m0, [maskq+0]
+ pand m0, [pb_mask]
+ pcmpeqd m0, [pb_mask]
+ pandn m8, m0 ; fm
+%endif
+
+ ; short filter
+
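+ ; the pixels are biased by 128 into signed-byte range so that
+ ; psubsb/paddsb saturation matches the iclip_diff() clamp to [-128,127]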
+ pxor m3, [pb_128]
+ pxor m6, [pb_128]
+ psubsb m10, m3, m6 ; iclip_diff(p1-q1)
+ pand m10, m7 ; f=iclip_diff(p1-q1)&hev
+ pxor m4, [pb_128]
+ pxor m5, [pb_128]
+ psubsb m11, m5, m4
+ paddsb m10, m11
+ paddsb m10, m11
+ paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f)
+ pand m8, m10 ; f&=fm
+ paddsb m10, m8, [pb_3]
+ paddsb m8, [pb_4]
+ pand m10, [pb_248]
+ pand m8, [pb_248]
+ psrlq m10, 3
+ psrlq m8, 3
+ pxor m10, [pb_16]
+ pxor m8, [pb_16]
+ psubb m10, [pb_16] ; f2
+ psubb m8, [pb_16] ; f1
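+ ; the pand pb_248 / psrlq 3 / pxor+psubb pb_16 sequence is a per-byte
+ ; arithmetic >>3: pb_248 clears the bits that would leak across byte
+ ; lanes in the 64-bit shift, and the xor/sub of 16 sign-extends the
+ ; remaining 5-bit value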
+ paddsb m4, m10
+ psubsb m5, m8
+ pxor m4, [pb_128]
+ pxor m5, [pb_128]
+
+ pxor m8, [pb_128]
+ pxor m10, m10
+ pavgb m8, m10 ; f=(f1+1)>>1
+ psubb m8, [pb_64]
+ pandn m8, m7, m8 ; f&=!hev
+ paddsb m3, m8
+ psubsb m6, m8
+ pxor m3, [pb_128]
+ pxor m6, [pb_128]
+
+%if %1 == 16
+ ; flat16 filter
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+%else
+ mova m0, [rsp+12*32]
+ mova m2, [rsp+13*32]
+ mova m7, [rsp+14*32]
+%endif
+
+ mova [rsp+0*32], m9
+ mova [rsp+1*32], m14
+ mova [rsp+2*32], m15
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ punpcklbw m14, m0, m12
+ punpckhbw m15, m0, m12
+ pmaddubsw m10, m14, [pb_7_1]
+ pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3
+ punpcklbw m8, m2, m7
+ punpckhbw m9, m2, m7
+ pmaddubsw m8, [pb_2]
+ pmaddubsw m9, [pb_2]
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3
+ punpcklbw m8, m13, m3
+ punpckhbw m9, m13, m3
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m9, [pb_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m8, m4, m5
+ punpckhbw m9, m4, m5
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m9, [pb_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ pand m8, m1
+ pandn m9, m1, m2
+ por m8, m9
+%ifidn %2, v
+ mova [tmpq+strideq*2], m8 ; p5
+%else
+ mova [rsp+13*32], m8
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, [pb_m1_1]
+ pmaddubsw m15, [pb_m1_1]
+ paddw m10, m14
+ paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m0, m6
+ punpckhbw m9, m0, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+3*32], m8
+ mova [rsp+4*32], m9
+ paddw m10, m8
+ paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m7, m8, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m8 ; p4
+%else
+ mova [rsp+14*32], m8
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ mova m14, [rsp+1*32]
+ punpcklbw m8, m0, m13
+ punpckhbw m9, m0, m13
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m8, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m2, [pb_m1_1]
+ mova [rsp+1*32], m8
+ paddw m10, m8
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m12, m8, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m8 ; p3
+%else
+ mova [rsp+19*32], m8
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ mova m15, [rsp+2*32]
+ punpcklbw m8, m0, m3
+ punpckhbw m9, m0, m3
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m8, m7, m15
+ punpckhbw m7, m15
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m7, [pb_m1_1]
+ mova [rsp+2*32], m8
+ paddw m10, m8
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m13, m8, m1
+ mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+%endif
+ punpcklbw m8, m0, m4
+ punpckhbw m9, m0, m4
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0] ; q4
+%else
+ mova m9, [rsp+15*32]
+%endif
+ punpcklbw m8, m12, m9
+ punpckhbw m9, m12, m9
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+7*32], m8
+ mova [rsp+5*32], m9
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m3, m8, m1
+ mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, v
+ mova m9, [tmpq+strideq*1] ; q5
+%else
+ mova m9, [rsp+16*32]
+%endif
+ punpcklbw m8, m0, m5
+ punpckhbw m0, m5
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m0, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m0, m13, m9
+ punpckhbw m9, m13, m9
+ mova m13, [rsp+6*32]
+ pmaddubsw m0, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+ 9*32], m0
+ mova [rsp+10*32], m9
+ paddw m10, m0
+ paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ pmulhrsw m0, m10, [pw_2048]
+ pmulhrsw m8, m11, [pw_2048]
+ packuswb m0, m8
+ vpblendvb m0, m4, m0, m1
+ mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else
+ mova m0, [rsp+17*32]
+%endif
+ paddw m10, [rsp+3*32]
+ paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ punpcklbw m8, m3, m0
+ punpckhbw m9, m3, m0
+ mova m3, [rsp+8*32]
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+3*32], m8
+ mova [rsp+4*32], m9
+ paddw m10, m8
+ paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m5, m8, m1
+ mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ paddw m10, [rsp+1*32]
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+ punpcklbw m8, m4, m0
+ punpckhbw m2, m4, m0
+ mova m4, [rsp+6*32]
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m2, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+ pmulhrsw m2, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m2, m9
+ vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, [rsp+2*32]
+ paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ punpcklbw m8, m5, m0
+ punpckhbw m9, m5, m0
+ mova m5, [rsp+8*32]
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m7, m9
+ vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ paddw m10, [rsp+7*32]
+ paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ punpcklbw m8, m6, m0
+ punpckhbw m9, m6, m0
+ SWAP 2, 6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m15, m8, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m8 ; q3
+%else
+ mova [rsp+20*32], m8
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, [rsp+ 9*32]
+ paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ punpcklbw m8, m14, m0
+ punpckhbw m9, m14, m0
+ SWAP 14, 7
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0]
+%else
+ mova m9, [rsp+15*32]
+%endif
+ vpblendvb m8, m9, m8, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m8 ; q4
+%else
+ mova [rsp+15*32], m8
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, [rsp+3*32]
+ paddw m11, [rsp+4*32] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ punpcklbw m8, m15, m0
+ punpckhbw m9, m15, m0
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, [pw_2048]
+ pmulhrsw m11, [pw_2048]
+ packuswb m10, m11
+%ifidn %2, v
+ mova m11, [tmpq+strideq*1]
+%else
+ mova m11, [rsp+16*32]
+%endif
+ vpblendvb m10, m11, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+16*32], m10
+%endif
+
+ mova m9, [rsp+0*32]
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%endif
+%if %1 >= 8
+ ; flat8 filter
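+ ; same incremental scheme at 8 bpc: word sums in m2/m7 built with
+ ; pmaddubsw, the +4 rounding bias entering through the q0/pb_4
+ ; interleave, and psrlw 3 producing the final (sum+4)>>3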
+ punpcklbw m0, m12, m3
+ punpckhbw m1, m12, m3
+ pmaddubsw m2, m0, [pb_3_1]
+ pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1
+ punpcklbw m8, m13, m4
+ punpckhbw m11, m13, m4
+ pmaddubsw m8, [pb_2_1]
+ pmaddubsw m11, [pb_2_1]
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m8, m5, [pb_4]
+ punpckhbw m11, m5, [pb_4]
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m11, [pb_1]
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m10, m13, m8, m9 ; p2
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; p2
+%endif
+
+ pmaddubsw m8, m0, [pb_m1_1]
+ pmaddubsw m11, m1, [pb_m1_1]
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m11, [pb_m1_1]
+ paddw m2, m8
+ paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m8, m3, m8, m9 ; p1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m8 ; p1
+%else
+ mova [rsp+0*32], m8
+%endif
+
+ pmaddubsw m0, [pb_1]
+ pmaddubsw m1, [pb_1]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m8, m4, m14
+ punpckhbw m11, m4, m14
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m11, [pb_1]
+ paddw m2, m8
+ paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m8, m4, m8, m9 ; p0
+%ifidn %2, v
+ mova [tmpq+stride3q ], m8 ; p0
+%else
+ mova [rsp+1*32], m8
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m8, m0, [pb_1]
+ pmaddubsw m11, m1, [pb_1]
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m4, m12
+ punpckhbw m11, m4, m12
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m11, [pb_1]
+ psubw m2, m8
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m11, m5, m8, m9 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11 ; q0
+%endif
+
+ pmaddubsw m0, [pb_m1_1]
+ pmaddubsw m1, [pb_m1_1]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m8, m13, m6
+ punpckhbw m13, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m13, [pb_m1_1]
+ paddw m2, m8
+ paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m8, m2, 3
+ psrlw m13, m7, 3
+ packuswb m8, m13
+ vpblendvb m13, m6, m8, m9 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m13 ; q1
+%endif
+
+ punpcklbw m0, m3, m6
+ punpckhbw m1, m3, m6
+ pmaddubsw m0, [pb_1]
+ pmaddubsw m1, [pb_1]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m14, m15
+ punpckhbw m1, m14, m15
+ pmaddubsw m0, [pb_1]
+ pmaddubsw m1, [pb_1]
+ paddw m2, m0
+ paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+ vpblendvb m2, m14, m2, m9 ; q2
+%ifidn %2, v
+ mova [dstq+strideq*2], m2 ; q2
+%else
+ mova m0, [rsp+0*32]
+ mova m1, [rsp+1*32]
+%if %1 == 8
+ ; 16x8 transpose
+ punpcklbw m3, m12, m10
+ punpckhbw m12, m10
+ punpcklbw m10, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m11, m13
+ punpckhbw m11, m13
+ punpcklbw m13, m2, m15
+ punpckhbw m2, m15
+
+ punpcklwd m15, m3, m10
+ punpckhwd m3, m10
+ punpcklwd m10, m12, m0
+ punpckhwd m12, m0
+ punpcklwd m0, m1, m13
+ punpckhwd m1, m13
+ punpcklwd m13, m11, m2
+ punpckhwd m11, m2
+
+ punpckldq m2, m15, m0
+ punpckhdq m15, m0
+ punpckldq m0, m3, m1
+ punpckhdq m3, m1
+ punpckldq m1, m10, m13
+ punpckhdq m10, m13
+ punpckldq m13, m12, m11
+ punpckhdq m12, m11
+
+ ; write 8x32
+ movq [dstq+strideq*0-4], xm2
+ movhps [dstq+strideq*1-4], xm2
+ movq [dstq+strideq*2-4], xm15
+ movhps [dstq+stride3q -4], xm15
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm0
+ movhps [dstq+strideq*1-4], xm0
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm1
+ movhps [dstq+strideq*1-4], xm1
+ movq [dstq+strideq*2-4], xm10
+ movhps [dstq+stride3q -4], xm10
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm13
+ movhps [dstq+strideq*1-4], xm13
+ movq [dstq+strideq*2-4], xm12
+ movhps [dstq+stride3q -4], xm12
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm2, m2, 1
+ vextracti128 xm15, m15, 1
+ vextracti128 xm0, m0, 1
+ vextracti128 xm3, m3, 1
+ vextracti128 xm1, m1, 1
+ vextracti128 xm10, m10, 1
+ vextracti128 xm13, m13, 1
+ vextracti128 xm12, m12, 1
+
+ movq [dstq+strideq*0-4], xm2
+ movhps [dstq+strideq*1-4], xm2
+ movq [dstq+strideq*2-4], xm15
+ movhps [dstq+stride3q -4], xm15
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm0
+ movhps [dstq+strideq*1-4], xm0
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm1
+ movhps [dstq+strideq*1-4], xm1
+ movq [dstq+strideq*2-4], xm10
+ movhps [dstq+stride3q -4], xm10
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm13
+ movhps [dstq+strideq*1-4], xm13
+ movq [dstq+strideq*2-4], xm12
+ movhps [dstq+stride3q -4], xm12
+ lea dstq, [dstq+strideq*4]
+%else
+ ; 16x16 transpose and store
+ SWAP 5, 10, 2
+ SWAP 6, 0
+ SWAP 7, 1
+ SWAP 8, 11
+ SWAP 9, 13
+ mova m0, [rsp+11*32]
+ mova m1, [rsp+12*32]
+ mova m2, [rsp+13*32]
+ mova m3, [rsp+14*32]
+ mova m4, [rsp+19*32]
+ mova m11, [rsp+20*32]
+ mova m12, [rsp+15*32]
+ mova m13, [rsp+16*32]
+ mova m14, [rsp+17*32]
+ TRANSPOSE_16X16B 1, 0, [rsp+18*32]
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm1
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm4
+ movu [dstq+strideq*1-8], xm5
+ movu [dstq+strideq*2-8], xm6
+ movu [dstq+stride3q -8], xm7
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm8
+ movu [dstq+strideq*1-8], xm9
+ movu [dstq+strideq*2-8], xm10
+ movu [dstq+stride3q -8], xm11
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm12
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m0, 1
+ vextracti128 [dstq+strideq*1-8], m1, 1
+ vextracti128 [dstq+strideq*2-8], m2, 1
+ vextracti128 [dstq+stride3q -8], m3, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m4, 1
+ vextracti128 [dstq+strideq*1-8], m5, 1
+ vextracti128 [dstq+strideq*2-8], m6, 1
+ vextracti128 [dstq+stride3q -8], m7, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m8, 1
+ vextracti128 [dstq+strideq*1-8], m9, 1
+ vextracti128 [dstq+strideq*2-8], m10, 1
+ vextracti128 [dstq+stride3q -8], m11, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m12, 1
+ vextracti128 [dstq+strideq*1-8], m13, 1
+ vextracti128 [dstq+strideq*2-8], m14, 1
+ vextracti128 [dstq+stride3q -8], m15, 1
+ lea dstq, [dstq+strideq*4]
+%endif
+%endif
+%elif %1 == 6
+ ; flat6 filter
+
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, [pb_3_1]
+ pmaddubsw m1, m11, [pb_3_1]
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, [pb_2]
+ pmaddubsw m12, m10, [pb_2]
+ paddw m0, m2
+ paddw m1, m12
+ pmulhrsw m2, m0, [pw_4096]
+ pmulhrsw m12, m1, [pw_4096]
+ packuswb m2, m12
+ vpblendvb m2, m3, m2, m9
+%ifidn %2, v
+ mova [tmpq+strideq*2], m2 ; p1
+%endif
+
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m11, [pb_m1_1]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m11, [pb_m1_1]
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m12, m0, [pw_4096]
+ pmulhrsw m13, m1, [pw_4096]
+ packuswb m12, m13
+ vpblendvb m12, m4, m12, m9
+%ifidn %2, v
+ mova [tmpq+stride3q], m12 ; p0
+%endif
+
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+ pmaddubsw m14, m8, [pb_m1_1]
+ pmaddubsw m13, m11, [pb_m1_1]
+ paddw m0, m14
+ paddw m1, m13
+ pmulhrsw m14, m0, [pw_4096]
+ pmulhrsw m13, m1, [pw_4096]
+ packuswb m14, m13
+ vpblendvb m14, m5, m14, m9
+%ifidn %2, v
+ mova [dstq+strideq*0], m14 ; q0
+%endif
+
+ pmaddubsw m8, [pb_m1_2]
+ pmaddubsw m11, [pb_m1_2]
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, [pb_m1_0]
+ pmaddubsw m10, [pb_m1_0]
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, [pw_4096]
+ pmulhrsw m1, [pw_4096]
+ packuswb m0, m1
+ vpblendvb m0, m6, m0, m9
+%ifidn %2, v
+ mova [dstq+strideq*1], m0 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
+%endif
+%endif
+%endmacro
+
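+; A scalar sketch (reading aid only, not part of the assembly) of what the
+; wd=8 (flat8) and wd=6 (flat6) branches of FILTER compute per column; it
+; matches the per-line sum comments in the macro:
+;
+;   // flat8 (wd=8):
+;   p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
+;   p1' = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
+;   p0' = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
+;   q0' = (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3
+;   q1' = (p1 + p0 + q0 + 2*q1 + q2 + 2*q3 + 4) >> 3
+;   q2' = (p0 + q0 + q1 + 2*q2 + 3*q3 + 4) >> 3
+;
+;   // flat6 (wd=6):
+;   p1' = (3*p2 + 2*p1 + 2*p0 + q0 + 4) >> 3
+;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+;   q0' = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
+;   q1' = (p0 + 2*q0 + 2*q1 + 3*q2 + 4) >> 3
+;
+; The +4 and >>3 are done either via pb_4 and psrlw (flat8) or via pmulhrsw
+; with pw_4096, which computes (x+4)>>3 directly (flat6).
+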
+INIT_YMM avx2
+cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+
+.loop:
+ cmp byte [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .v4
+
+.end:
+ add lq, 32
+ add dstq, 32
+ add maskq, 1
+ sub wd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
+
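+; The sb entry points in this file share one dispatch pattern: each loop
+; iteration covers 32 pixels along the edge, and the current byte of the
+; vmask[] bitmasks (one bit per 4-pixel unit) picks the widest filter needed
+; anywhere in that group; narrower columns are masked off inside FILTER.
+; Roughly, for the vertical luma case (a sketch, not literal code):
+;
+;   do {
+;       if      (vmask2_byte) FILTER 16, v
+;       else if (vmask1_byte) FILTER  8, v
+;       else if (vmask0_byte) FILTER  4, v
+;       dst += 32; l += 32; mask++; w -= 8;   // w counts 4-px units
+;   } while (w > 0);
+;
+; The horizontal variants step down by rows instead (and decrement h), and the
+; uv variants top out at FILTER 6.
+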
+INIT_YMM avx2
+cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+
+.loop:
+ cmp byte [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .no_filter
+
+ call .h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ add maskq, 1
+ sub hd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
+
+INIT_YMM avx2
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+
+.loop:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4
+
+.end:
+ add lq, 32
+ add dstq, 32
+ add maskq, 1
+ sub wd, 8
+ jg .loop
+ RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+
+.loop:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .no_filter
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ add maskq, 1
+ sub hd, 8
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter_avx512.asm b/third_party/dav1d/src/x86/loopfilter_avx512.asm
new file mode 100644
index 0000000000..0218b624d3
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_avx512.asm
@@ -0,0 +1,1534 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+
+pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080
+ dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000
+
+hmulA: dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60
+hmulB: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51
+hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49
+hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+pb_1: times 4 db 1
+pb_2: times 4 db 2
+pb_3: times 4 db 3
+pb_4: times 4 db 4
+pb_16: times 4 db 16
+pb_63: times 4 db 63
+pb_64: times 4 db 64
+pb_128: times 4 db 0x80
+pb_240: times 4 db 0xf0
+pb_248: times 4 db 0xf8
+pb_254: times 4 db 0xfe
+pb_2_1: times 2 db 2, 1
+pb_3_1: times 2 db 3, 1
+pb_7_1: times 2 db 7, 1
+pb_m1_0: times 2 db -1, 0
+pb_m1_1: times 2 db -1, 1
+pb_m1_2: times 2 db -1, 2
+pw_2048: times 2 dw 2048
+pw_4096: times 2 dw 4096
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+ kmovw k1, k6
+ lea t0, [dstq+strideq*4]
+ vpscatterdd [dstq+m19-2]{k1}, m%4
+ kmovw k1, k6
+ lea t1, [dstq+strideq*8]
+ vpscatterdd [t0 +m19-2]{k1}, m%5
+ kmovw k1, k6
+ lea t2, [t0 +strideq*8]
+ vpscatterdd [t1 +m19-2]{k1}, m%2
+ kmovw k1, k6
+ vpscatterdd [t2 +m19-2]{k1}, m%1
+%endmacro
+
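+; Full 16x16 byte transpose within each 128-bit lane (four independent 16x16
+; tiles per zmm), done as four interleave passes: bytes, words, dwords,
+; qwords. %1/%2 select whether row 15 is loaded from / row 0 is stored to the
+; scratch slot %3 instead of staying in a register.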
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%if %1 == 0
+ SWAP m16, m22
+%endif
+ punpcklbw m22, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m25, m13
+ punpckhbw m25, m13
+%if %1 == 0
+ SWAP m13, m16
+%else
+ mova m13, %3
+%endif
+ SWAP m16, m25
+ punpcklbw m25, m14, m13
+ punpckhbw m13, m14, m13
+ ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
+ punpcklwd m14, m22, m26
+ punpckhwd m22, m26
+ punpcklwd m26, m24, m2
+ punpckhwd m24, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m25
+ punpckhwd m11, m25
+ SWAP m25, m16, m11
+ punpcklwd m11, m25, m13
+ punpckhwd m25, m13
+ ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
+ punpckldq m13, m14, m2
+ punpckhdq m14, m2
+ punpckldq m2, m22, m3
+ punpckhdq m22, m3
+ punpckldq m3, m26, m5
+ punpckhdq m26, m5
+ punpckldq m5, m24, m4
+ punpckhdq m24, m4
+ punpckldq m4, m6, m10
+ punpckhdq m6, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m25
+ punpckhdq m8, m25
+ SWAP m25, m16, m8
+ punpckldq m8, m7, m25
+ punpckhdq m7, m25
+ ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m25, m13, m4
+ punpckhqdq m13, m4
+ punpcklqdq m4, m14, m6
+ punpckhqdq m14, m6
+ punpcklqdq m6, m2, m8
+ punpckhqdq m2, m8
+ punpcklqdq m8, m22, m7
+ punpckhqdq m22, m7
+ punpcklqdq m7, m3, m10
+ punpckhqdq m3, m10
+ punpcklqdq m10, m26, m9
+ punpckhqdq m26, m9
+ punpcklqdq m9, m5, m11
+ punpckhqdq m5, m11
+ SWAP m11, m16
+%if %2 == 0
+ SWAP m16, m25
+%else
+ mova %3, m25
+%endif
+ punpcklqdq m25, m24, m11
+ punpckhqdq m24, m11
+%if %2 == 0
+ SWAP m11, m16
+%endif
+ ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24
+ SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
+ SWAP 3, 14, 25, 9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%define is_h 0
+%if %1 == 4
+ lea t0, [dstq+mstrideq*2]
+ mova m3, [t0 +strideq*0] ; p1
+ mova m4, [t0 +strideq*1] ; p0
+ mova m5, [t0 +strideq*2] ; q0
+ mova m6, [t0 +stride3q ] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+%if %1 == 16
+ lea t0, [dstq+mstrideq*8]
+ mova m16, [t0 +strideq*1]
+ mova m17, [t0 +strideq*2]
+ mova m18, [t0 +stride3q ]
+%endif
+ lea t0, [dstq+mstrideq*4]
+%if %1 != 6
+ mova m25, [t0 +strideq*0]
+%endif
+ mova m13, [t0 +strideq*1]
+ mova m3, [t0 +strideq*2]
+ mova m4, [t0 +stride3q ]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m22, [dstq+stride3q ]
+%endif
+%if %1 == 16
+ lea t0, [dstq+strideq*4]
+ mova m29, [t0 +strideq*0]
+ mova m30, [t0 +strideq*1]
+ mova m31, [t0 +strideq*2]
+%endif
+%endif
+%else ; h
+%define is_h 1
+ ; load lines
+%if %1 == 4
+ vbroadcasti32x4 m0, [hshuf4]
+ kmovw k1, k6
+ lea t0, [dstq+strideq*4]
+ vpgatherdd m3{k1}, [dstq+m19-2]
+ kmovw k1, k6
+ lea t1, [dstq+strideq*8]
+ vpgatherdd m4{k1}, [t0 +m19-2]
+ kmovw k1, k6
+ lea t2, [t0 +strideq*8]
+ vpgatherdd m5{k1}, [t1 +m19-2]
+ kmovw k1, k6
+ vpgatherdd m6{k1}, [t2 +m19-2]
+ pshufb m3, m0
+ pshufb m4, m0
+ pshufb m5, m0
+ pshufb m6, m0
+ punpckldq m7, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpcklqdq m6, m7, m4
+ punpckhqdq m7, m4
+ punpcklqdq m4, m3, m5
+ punpckhqdq m3, m5
+ SWAP 3, 6
+ SWAP 5, 4, 7
+ ; 6,7,4,3 -> 3,4,5,6
+%elif %1 == 6 || %1 == 8
+ kmovb k1, k7
+ lea t0, [dstq+strideq*1]
+ vpgatherdq m3{k1}, [dstq+ym21-%1/2]
+ kmovb k1, k7
+ lea t1, [dstq+strideq*2]
+ vpgatherdq m4{k1}, [t0 +ym21-%1/2]
+ kmovb k1, k7
+ lea t2, [dstq+stride3q ]
+ vpgatherdq m5{k1}, [t1 +ym21-%1/2]
+ kmovb k1, k7
+ vextracti32x8 ym0, m21, 1
+ vpgatherdq m6{k1}, [t2 +ym21-%1/2]
+ kmovb k1, k7
+ vpgatherdq m12{k1}, [dstq+ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m13{k1}, [t0 +ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m14{k1}, [t1 +ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m15{k1}, [t2 +ym0 -%1/2]
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm12: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ punpcklbw m6, m12, m13
+ punpckhbw m12, m13
+ punpcklbw m13, m14, m15
+ punpckhbw m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m15, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m13
+ punpckhwd m6, m13
+ punpcklwd m13, m12, m14
+ punpckhwd m12, m14
+ ; xm15: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm13: A12-15,B12-15,C12-15,D12-15
+ ; xm12: E12-15,F12-15,G12-15,H12-15
+ punpckldq m14, m15, m5
+ punpckhdq m15, m5
+ punpckldq m5, m7, m6
+ %if %1 != 6
+ punpckhdq m7, m6
+ %endif
+ punpckldq m6, m4, m13
+ punpckhdq m4, m13
+ punpckldq m13, m3, m12
+ %if %1 != 6
+ punpckhdq m12, m3, m12
+ %endif
+ ; xm14: A0-7,B0-7
+ ; xm15: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm13: E8-15,F8-15
+ ; xm12: G8-15,H8-15
+ punpcklqdq m3, m14, m6
+ punpckhqdq m14, m6
+ punpckhqdq m6, m15, m4
+ punpcklqdq m15, m4
+ punpcklqdq m4, m5, m13
+ punpckhqdq m13, m5, m13
+ %if %1 == 8
+ punpcklqdq m5, m7, m12
+ punpckhqdq m25, m7, m12
+ ; xm3: A0-15
+ ; xm14: B0-15
+ ; xm15: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm13: F0-15
+ ; xm5: G0-15
+ ; xm25: H0-15
+ SWAP 25, 3, 15
+ SWAP 13, 14, 5, 4, 6
+ SWAP 15, 22
+ ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22
+ %else
+ SWAP 13, 3, 14
+ SWAP 6, 4, 15, 5
+ ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+ %endif
+%else ; 16, h
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+ movu xm24, [dstq+strideq*0-8]
+ movu xm26, [dstq+strideq*1-8]
+ movu xm2, [dstq+strideq*2-8]
+ movu xm3, [dstq+stride3q -8]
+ lea t0, [dstq+strideq*4]
+ movu xm4, [t0 +strideq*0-8]
+ movu xm5, [t0 +strideq*1-8]
+ movu xm6, [t0 +strideq*2-8]
+ movu xm7, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ movu xm8, [t0 +strideq*0-8]
+ movu xm9, [t0 +strideq*1-8]
+ movu xm10, [t0 +strideq*2-8]
+ movu xm11, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ movu xm25, [t0 +strideq*0-8]
+ movu xm13, [t0 +strideq*1-8]
+ movu xm14, [t0 +strideq*2-8]
+ movu xm22, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym24, [t0 +strideq*0-8], 1
+ vinserti32x4 ym26, [t0 +strideq*1-8], 1
+ vinserti32x4 ym2, [t0 +strideq*2-8], 1
+ vinserti32x4 ym3, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym4, [t0 +strideq*0-8], 1
+ vinserti32x4 ym5, [t0 +strideq*1-8], 1
+ vinserti32x4 ym6, [t0 +strideq*2-8], 1
+ vinserti32x4 ym7, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym8, [t0 +strideq*0-8], 1
+ vinserti32x4 ym9, [t0 +strideq*1-8], 1
+ vinserti32x4 ym10, [t0 +strideq*2-8], 1
+ vinserti32x4 ym11, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym25, [t0 +strideq*0-8], 1
+ vinserti32x4 ym13, [t0 +strideq*1-8], 1
+ vinserti32x4 ym14, [t0 +strideq*2-8], 1
+ vinserti32x4 ym22, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m24, [t0 +strideq*0-8], 2
+ vinserti32x4 m26, [t0 +strideq*1-8], 2
+ vinserti32x4 m2, [t0 +strideq*2-8], 2
+ vinserti32x4 m3, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m4, [t0 +strideq*0-8], 2
+ vinserti32x4 m5, [t0 +strideq*1-8], 2
+ vinserti32x4 m6, [t0 +strideq*2-8], 2
+ vinserti32x4 m7, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m8, [t0 +strideq*0-8], 2
+ vinserti32x4 m9, [t0 +strideq*1-8], 2
+ vinserti32x4 m10, [t0 +strideq*2-8], 2
+ vinserti32x4 m11, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m25, [t0 +strideq*0-8], 2
+ vinserti32x4 m13, [t0 +strideq*1-8], 2
+ vinserti32x4 m14, [t0 +strideq*2-8], 2
+ vinserti32x4 m22, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m24, [t0 +strideq*0-8], 3
+ vinserti32x4 m26, [t0 +strideq*1-8], 3
+ vinserti32x4 m2, [t0 +strideq*2-8], 3
+ vinserti32x4 m3, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m4, [t0 +strideq*0-8], 3
+ vinserti32x4 m5, [t0 +strideq*1-8], 3
+ vinserti32x4 m6, [t0 +strideq*2-8], 3
+ vinserti32x4 m7, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m8, [t0 +strideq*0-8], 3
+ vinserti32x4 m9, [t0 +strideq*1-8], 3
+ vinserti32x4 m10, [t0 +strideq*2-8], 3
+ vinserti32x4 m11, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m25, [t0 +strideq*0-8], 3
+ vinserti32x4 m13, [t0 +strideq*1-8], 3
+ vinserti32x4 m14, [t0 +strideq*2-8], 3
+ vinserti32x4 m22, [t0 +stride3q -8], 3
+ ;
+ TRANSPOSE_16X16B 0, 1, [rsp+0*64]
+ SWAP m16, m26
+ SWAP m17, m2
+ SWAP m18, m3
+ SWAP m29, m25
+ SWAP m30, m13
+ SWAP m31, m14
+ mova [rsp+4*64], m22
+ ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
+ SWAP 25, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 22
+%endif
+%endif
+
+ ; load L/E/I/H
+ vpbroadcastd m15, [pb_1]
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ kmovw k1, k6
+ vpgatherdd m0{k1}, [lq+m20+4]
+ kmovw k1, k6
+ vpgatherdd m1{k1}, [lq+m20+0]
+%endif
+ pxor m2, m2
+ pcmpeqb k1, m0, m2
+ vmovdqu8 m0{k1}, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, pbshuf ; l[x][0]
+ vpcmpub k3, m0, m2, 4 ; neq ; L
+ psrlq m2, m0, [lutq+128]
+ pand m2, [pb_63]{bcstd}
+ vpbroadcastb m1, [lutq+136]
+ pminub m2, m1
+ pmaxub m2, m15 ; I
+ pand m1, m0, [pb_240]{bcstd}
+ psrlq m1, 4 ; H
+ paddd m0, [pb_2]{bcstd}
+ paddb m0, m0
+ paddb m0, m2 ; E
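+ ; i.e. I = clamp(L >> sharp[0], 1, sharp[1]), H = L >> 4, E = 2*(L+2) + I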
+
+ ABSSUB m8, m3, m4, m9 ; abs(p1-p0)
+ ABSSUB m9, m5, m6, m10 ; abs(q1-q0)
+ pmaxub m8, m9
+ vpcmpub k1, m8, m1, 6 ; gt ; hev
+%if %1 != 4
+ %if %1 == 6
+ ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
+ pmaxub m9, m8
+ %else
+ ABSSUB m9, m25, m4, m10 ; abs(p3-p0)
+ pmaxub m9, m8
+ ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
+ pmaxub m9, m10
+ %endif
+ ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
+ pmaxub m9, m10
+ %if %1 != 6
+ ABSSUB m10, m5, m22, m11 ; abs(q3-q0)
+ pmaxub m9, m10
+ %endif
+ vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in
+ %if %1 == 6
+ ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
+ %else
+ ABSSUB m10, m25, m13, m11 ; abs(p3-p2)
+ ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m10, m11
+ ABSSUB m11, m14, m22, m1 ; abs(q3-q2)
+ pmaxub m10, m11
+ %endif
+ ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m10, m11
+ %if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ por m11, [maskq+4]{bcstd}
+ %else
+ vpbroadcastd m11, [maskq+4]
+ %endif
+ vptestmd k4, m11, pbmask
+ vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks
+ pmaxub m8, m10
+%endif
+ vpcmpub k3{k3}, m8, m2, 2 ; le
+ ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
+ ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
+ paddusb m11, m11
+ pand m10, [pb_254]{bcstd}
+ psrlq m10, 1
+ paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+
+%if %1 == 16
+ ABSSUB m1, m16, m4, m2
+ ABSSUB m2, m17, m4, m10
+ pmaxub m1, m2
+ ABSSUB m2, m18, m4, m10
+ pmaxub m1, m2
+ ABSSUB m2, m29, m5, m10
+ pmaxub m1, m2
+ ABSSUB m2, m30, m5, m10
+ pmaxub m1, m2
+ ABSSUB m2, m31, m5, m10
+ pmaxub m1, m2
+ kandq k2, k2, k3
+ vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out
+ vpbroadcastd m2, [maskq+8]
+ vptestmd k5, m2, pbmask
+ vpmovm2d m7, k5
+ vptestmb k4{k4}, m7, m7 ; flat16 & fm
+ por m10, m2, [maskq+4]{bcstd}
+ vptestmd k5, m10, pbmask
+ vpmovm2d m7, k5
+ vptestmb k2{k2}, m7, m7 ; flat8in
+ por m2, m10, [maskq+0]{bcstd}
+ vptestmd k5, m2, pbmask
+ vpmovm2d m7, k5
+ vptestmb k3{k3}, m7, m7
+ kandnq k3, k2, k3 ; fm & !flat8 & !flat16
+ kandnq k2, k4, k2 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ vptestmd k4, m0, pbmask
+ vpmovm2d m7, k4
+ vptestmb k2{k2}, m7, m7
+ kandq k2, k2, k3 ; flat8 & fm
+ por m0, [maskq+0]{bcstd}
+ vptestmd k4, m0, pbmask
+ vpmovm2d m7, k4
+ vptestmb k3{k3}, m7, m7
+ kandnq k3, k2, k3 ; fm & !flat8
+%else
+ %ifidn %2, v
+ vptestmd k4, pbmask, [maskq+0]{bcstd}
+ %else
+ vpbroadcastd m0, [maskq+0]
+ vptestmd k4, m0, pbmask
+ %endif
+ vpmovm2d m7, k4
+ vptestmb k3{k3}, m7, m7 ; fm
+%endif
+
+ ; short filter
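+ ; scalar sketch of the 4-tap filter below (reading aid only):
+ ;   f  = iclip_diff(p1 - q1) & hev
+ ;   f  = iclip_diff(3*(q0 - p0) + f) & fm
+ ;   f1 = (f + 4) >> 3, f2 = (f + 3) >> 3  (both saturated at +127 first)
+ ;   q0 -= f1; p0 += f2
+ ;   if (!hev) { p1 += (f1 + 1) >> 1; q1 -= (f1 + 1) >> 1; }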
+%if %1 >= 8
+ SWAP m23, m15
+%endif
+ vpbroadcastd m15, [pb_3]
+ vpbroadcastd m0, [pb_4]
+ vpbroadcastd m12, [pb_16]
+ vpbroadcastd m1, [pb_64]
+ pxor m3, pb128
+ pxor m6, pb128
+ psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev
+ pxor m4, pb128
+ pxor m5, pb128
+ psubsb m11, m5, m4
+ paddsb m10, m11
+ paddsb m10, m11
+ paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
+ paddsb m8, m10, m15
+ paddsb m10, m0
+ pand m8, [pb_248]{bcstd}
+ pand m10, [pb_248]{bcstd}
+ psrlq m8, 3
+ psrlq m10, 3
+ pxor m8, m12
+ pxor m10, m12
+ psubb m8, m12 ; f2
+ psubb m10, m12 ; f1
+ paddsb m4, m8
+ psubsb m5, m10
+ pxor m4, pb128
+ pxor m5, pb128
+ ;
+ pxor m10, pb128
+ pxor m8, m8
+ pavgb m8, m10 ; f=(f1+1)>>1
+ psubb m8, m1
+ knotq k1, k1
+ paddsb m3{k1}, m3, m8
+ psubsb m6{k1}, m6, m8
+ pxor m3, pb128
+ pxor m6, pb128
+
+%if %1 == 16
+ ; flat16 filter
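+ ; steps A-L keep a running 16-tap sum in m10/m11 (low/high byte halves);
+ ; each output reuses the previous sum, subtracting two taps and adding two
+ ; (the [-x,+y] annotations), then rounds via pmulhrsw with pw_2048,
+ ; i.e. (sum+8)>>4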
+%ifidn %2, v
+ lea t0, [dstq+mstrideq*8]
+%endif
+ SWAP m24, m16, m14
+ SWAP m2, m17, m22
+ SWAP m7, m18
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ vpbroadcastd m1, [pb_7_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m14, m24, m25
+ punpckhbw m22, m24, m25
+ pmaddubsw m10, m14, m1
+ pmaddubsw m11, m22, m1 ; p6*7+p3
+ punpcklbw m8, m2, m7
+ punpckhbw m9, m2, m7
+ pmaddubsw m8, m12
+ pmaddubsw m9, m12
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3
+%ifidn %2, h
+ vpbroadcastd m27, [pw_2048]
+ vpbroadcastd m1, [pb_m1_1]
+ %define pw2048 m27
+ %define pbm1_1 m1
+%endif
+ punpcklbw m8, m13, m3
+ punpckhbw m9, m13, m3
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m8, m4, m5
+ punpckhbw m9, m4, m5
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*2]{k4}, m8 ; p5
+%else
+ vpblendmb m8{k4}, m2, m8
+ mova [rsp+1*64], m8
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, pbm1_1
+ pmaddubsw m22, pbm1_1
+ paddw m10, m14
+ paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m24, m6
+ punpckhbw m9, m24, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ SWAP m18, m8
+ SWAP m23, m9
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+stride3q]{k4}, m8 ; p4
+%else
+ vpblendmb m8{k4}, m7, m8
+ mova [rsp+2*64], m8
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ SWAP m14, m16
+ punpcklbw m8, m24, m13
+ punpckhbw m9, m24, m13
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m8, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m2, pbm1_1
+ paddw m10, m8
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ SWAP m16, m8
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3
+%else
+ vpblendmb m8{k4}, m25, m8
+ mova [rsp+3*64], m8
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ SWAP m22, m17
+ punpcklbw m8, m24, m3
+ punpckhbw m9, m24, m3
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m8, m7, m22
+ punpckhbw m7, m22
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m7, pbm1_1
+ paddw m10, m8
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ SWAP m17, m8
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+ vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+%ifidn %2, v
+ lea t0, [dstq+strideq*4]
+%endif
+ punpcklbw m8, m24, m4
+ punpckhbw m9, m24, m4
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+ punpcklbw m8, m25, m29
+ punpckhbw m9, m25, m29
+ SWAP m26, m29
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ SWAP m29, m8
+ SWAP m0, m9
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+ vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, h
+ SWAP m28, m24
+ punpcklbw m8, m28, m5
+ punpckhbw m24, m28, m5
+%else
+ punpcklbw m8, m24, m5
+ punpckhbw m24, m5
+%endif
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m24, pbm1_1
+ paddw m10, m8
+ paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m24, m13, m30
+ punpckhbw m9, m13, m30
+%ifidn %2, h
+ SWAP m27, m30
+%endif
+ SWAP m13, m15
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m24
+ paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ SWAP m30, m24
+ SWAP m15, m9
+%ifidn %2, h
+ SWAP m9, m24
+ %define pw2048 m9
+%endif
+ pmulhrsw m24, m10, pw2048
+ pmulhrsw m8, m11, pw2048
+ paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ paddw m11, m23
+ packuswb m24, m8
+ punpcklbw m8, m3, m31
+ pmaddubsw m8, pbm1_1
+ paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ SWAP m18, m8
+ pmulhrsw m8, m10, pw2048
+ paddw m10, m16 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+ SWAP m16, m9
+ %define pw2048 m16
+%endif
+ punpckhbw m9, m3, m31
+ SWAP m3, m12
+ pmaddubsw m9, pbm1_1
+ paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ SWAP m23, m9
+ pmulhrsw m9, m11, pw2048
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+ SWAP m2, m1
+ %define pbm1_1 m2
+%endif
+ vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+ SWAP m24, m31 ; q6
+ packuswb m8, m9
+%ifidn %2, h
+ SWAP m31, m2
+ %define pbm1_1 m31
+%endif
+ vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ punpcklbw m8, m4, m24
+ punpckhbw m2, m4, m24
+ SWAP m4, m1
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m2, pbm1_1
+ paddw m10, m8
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+ pmulhrsw m2, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m2, m9
+ vpblendmb m2{k4}, m6, m2 ; don't clobber q1/m6 since we need it in K
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ paddw m11, m7
+ punpcklbw m8, m5, m24
+ punpckhbw m9, m5, m24
+ SWAP m5, m12
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m7, m9
+ vpblendmb m7{k4}, m14, m7 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ paddw m11, m0
+ punpcklbw m8, m6, m24
+ punpckhbw m9, m6, m24
+ SWAP 2, 6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+mstrideq]{k4}, m8
+%else
+ SWAP m29, m16
+ %define pw2048 m29
+ vpblendmb m16{k4}, m22, m8
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ paddw m11, m15
+%ifidn %2, h
+ SWAP m15, m8
+%endif
+ punpcklbw m8, m14, m24
+ punpckhbw m9, m14, m24
+ SWAP 14, 7
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4
+%else
+ vpblendmb m17{k4}, m26, m8
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, m18 ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ paddw m11, m23
+ punpcklbw m8, m22, m24
+ punpckhbw m9, m22, m24
+ SWAP m30, m24
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, pw2048
+ pmulhrsw m11, pw2048
+ packuswb m10, m11
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*1]{k4}, m10 ; q5
+%else
+ vmovdqu8 m27{k4}, m10
+%endif
+
+%ifidn %2, v
+ lea t0, [dstq+mstrideq*4]
+%endif
+%endif
+
+%if %1 >= 8
+ ; flat8 filter
+ vpbroadcastd m9, [pb_3_1]
+ vpbroadcastd m10, [pb_2_1]
+%if %1 == 16
+ vpbroadcastd m23, [pb_1]
+ vpbroadcastd m0, [pb_4]
+%elifidn %2, h
+ vpbroadcastd m31, [pb_m1_1]
+ %define pbm1_1 m31
+%endif
+ punpcklbw m24, m25, m3
+ punpckhbw m26, m25, m3
+ pmaddubsw m2, m24, m9
+ pmaddubsw m7, m26, m9 ; 3 * p3 + p1
+ punpcklbw m8, m13, m4
+ punpckhbw m11, m13, m4
+ pmaddubsw m8, m10
+ pmaddubsw m11, m10
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m8, m5, m0
+ punpckhbw m11, m5, m0
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+%if is_h || %1 == 16
+ vpblendmb m10{k2}, m13, m8 ; p2
+%endif
+%ifidn %2, v
+ %if %1 == 8
+ vmovdqu8 [t0+strideq*1]{k2}, m8
+ %else
+ mova [t0+strideq*1], m10
+ %endif
+%endif
+
+ pmaddubsw m8, m24, pbm1_1
+ pmaddubsw m11, m26, pbm1_1
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m2, m8
+ paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m8{k2}, m3, m8 ; p1
+%ifidn %2, v
+ mova [t0+strideq*2], m8
+%else
+ SWAP m18, m8
+%endif
+
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m8, m4, m14
+ punpckhbw m11, m4, m14
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ paddw m2, m8
+ paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m8{k2}, m4, m8 ; p0
+%ifidn %2, v
+ mova [t0+stride3q], m8
+%else
+ SWAP m29, m8
+%endif
+
+ punpcklbw m24, m5, m22
+ punpckhbw m26, m5, m22
+ pmaddubsw m8, m24, m23
+ pmaddubsw m11, m26, m23
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m4, m25
+ punpckhbw m11, m4, m25
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ psubw m2, m8
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m11{k2}, m5, m8 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11
+%endif
+
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m26, pbm1_1
+ paddw m2, m24
+ paddw m7, m26
+ punpcklbw m8, m13, m6
+ punpckhbw m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m13, pbm1_1
+ paddw m2, m8
+ paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m8, m2, 3
+ psrlw m13, m7, 3
+ packuswb m8, m13
+ vpblendmb m13{k2}, m6, m8 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m13
+%endif
+
+ punpcklbw m24, m3, m6
+ punpckhbw m26, m3, m6
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m24, m14, m22
+ punpckhbw m26, m14, m22
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ paddw m2, m24
+ paddw m7, m26 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+%if is_h || %1 == 16
+ vpblendmb m2{k2}, m14, m2 ; q2
+%endif
+%ifidn %2, v
+ %if %1 == 8
+ vmovdqu8 [dstq+strideq*2]{k2}, m2
+ %else
+ mova [dstq+strideq*2], m2
+ %endif
+%endif
+
+%ifidn %2, h
+ SWAP m24, m18
+ SWAP m26, m29
+%if %1 == 8
+ ; 16x8 transpose
+ punpcklbw m3, m25, m10
+ punpckhbw m25, m10
+ punpcklbw m10, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m11, m13
+ punpckhbw m11, m13
+ punpcklbw m13, m2, m22
+ punpckhbw m2, m22
+ ;
+ punpcklwd m22, m3, m10
+ punpckhwd m3, m10
+ punpcklwd m10, m25, m24
+ punpckhwd m25, m24
+ punpcklwd m24, m26, m13
+ punpckhwd m26, m13
+ punpcklwd m13, m11, m2
+ punpckhwd m11, m2
+ ;
+ punpckldq m2, m22, m24
+ punpckhdq m22, m24
+ punpckldq m24, m3, m26
+ punpckhdq m3, m26
+ punpckldq m26, m10, m13
+ punpckhdq m10, m13
+ punpckldq m13, m25, m11
+ punpckhdq m25, m11
+ ; write 8x32
+ vpbroadcastd ym16, strided
+ pmulld ym16, [hmulD]
+ lea t1, [dstq+strideq*2]
+ lea t2, [dstq+strideq*4]
+ lea t3, [t1 +strideq*4]
+ lea t0, [dstq+strideq*8]
+ kmovb k1, k6
+ kmovb k2, k6
+ kmovb k3, k6
+ kmovb k4, k6
+ vpscatterdq [dstq+ym16-4]{k1}, m2
+ vpscatterdq [t1 +ym16-4]{k2}, m22
+ vpscatterdq [t2 +ym16-4]{k3}, m24
+ vpscatterdq [t3 +ym16-4]{k4}, m3
+ lea t1, [t0+strideq*2]
+ lea t2, [t0+strideq*4]
+ lea t3, [t1+strideq*4]
+ kmovb k1, k6
+ kmovb k2, k6
+ kmovb k3, k6
+ kmovb k4, k6
+ vpscatterdq [t0+ym16-4]{k1}, m26
+ vpscatterdq [t1+ym16-4]{k2}, m10
+ vpscatterdq [t2+ym16-4]{k3}, m13
+ vpscatterdq [t3+ym16-4]{k4}, m25
+%else
+ ; 16x16 transpose and store
+ SWAP 5, 10, 2
+ SWAP 6, 24
+ SWAP 7, 26
+ SWAP 8, 11
+ SWAP 9, 13
+ mova m24, [rsp+0*64]
+ SWAP m26, m28
+ mova m2, [rsp+1*64]
+ mova m3, [rsp+2*64]
+ mova m4, [rsp+3*64]
+ SWAP m11, m16
+ SWAP m25, m17
+ SWAP m13, m27
+ SWAP m14, m30
+ TRANSPOSE_16X16B 1, 0, [rsp+4*64]
+ movu [dstq+strideq*0-8], xm24
+ movu [dstq+strideq*1-8], xm26
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea t0, [dstq+strideq*4]
+ movu [t0+strideq*0-8], xm4
+ movu [t0+strideq*1-8], xm5
+ movu [t0+strideq*2-8], xm6
+ movu [t0+stride3q -8], xm7
+ lea t0, [t0+strideq*4]
+ movu [t0+strideq*0-8], xm8
+ movu [t0+strideq*1-8], xm9
+ movu [t0+strideq*2-8], xm10
+ movu [t0+stride3q -8], xm11
+ lea t0, [t0+strideq*4]
+ movu [t0+strideq*0-8], xm25
+ movu [t0+strideq*1-8], xm13
+ movu [t0+strideq*2-8], xm14
+ movu [t0+stride3q -8], xm22
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym24, 1
+ vextracti128 [t0+strideq*1-8], ym26, 1
+ vextracti128 [t0+strideq*2-8], ym2, 1
+ vextracti128 [t0+stride3q -8], ym3, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym4, 1
+ vextracti128 [t0+strideq*1-8], ym5, 1
+ vextracti128 [t0+strideq*2-8], ym6, 1
+ vextracti128 [t0+stride3q -8], ym7, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym8, 1
+ vextracti128 [t0+strideq*1-8], ym9, 1
+ vextracti128 [t0+strideq*2-8], ym10, 1
+ vextracti128 [t0+stride3q -8], ym11, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym25, 1
+ vextracti128 [t0+strideq*1-8], ym13, 1
+ vextracti128 [t0+strideq*2-8], ym14, 1
+ vextracti128 [t0+stride3q -8], ym22, 1
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m24, 2
+ vextracti32x4 [t0+strideq*1-8], m26, 2
+ vextracti32x4 [t0+strideq*2-8], m2, 2
+ vextracti32x4 [t0+stride3q -8], m3, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m4, 2
+ vextracti32x4 [t0+strideq*1-8], m5, 2
+ vextracti32x4 [t0+strideq*2-8], m6, 2
+ vextracti32x4 [t0+stride3q -8], m7, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m8, 2
+ vextracti32x4 [t0+strideq*1-8], m9, 2
+ vextracti32x4 [t0+strideq*2-8], m10, 2
+ vextracti32x4 [t0+stride3q -8], m11, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m25, 2
+ vextracti32x4 [t0+strideq*1-8], m13, 2
+ vextracti32x4 [t0+strideq*2-8], m14, 2
+ vextracti32x4 [t0+stride3q -8], m22, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m24, 3
+ vextracti32x4 [t0+strideq*1-8], m26, 3
+ vextracti32x4 [t0+strideq*2-8], m2, 3
+ vextracti32x4 [t0+stride3q -8], m3, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m4, 3
+ vextracti32x4 [t0+strideq*1-8], m5, 3
+ vextracti32x4 [t0+strideq*2-8], m6, 3
+ vextracti32x4 [t0+stride3q -8], m7, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m8, 3
+ vextracti32x4 [t0+strideq*1-8], m9, 3
+ vextracti32x4 [t0+strideq*2-8], m10, 3
+ vextracti32x4 [t0+stride3q -8], m11, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m25, 3
+ vextracti32x4 [t0+strideq*1-8], m13, 3
+ vextracti32x4 [t0+strideq*2-8], m14, 3
+ vextracti32x4 [t0+stride3q -8], m22, 3
+%endif
+%endif
+
+%elif %1 == 6
+ ; flat6 filter
+ vpbroadcastd m15, [pb_3_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, m15
+ pmaddubsw m1, m11, m15
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, m12
+ pmaddubsw m12, m10, m12
+%ifidn %2, h
+ vpbroadcastd m15, [pb_m1_1]
+ %define pbm1_1 m15
+%endif
+ paddw m0, m2
+ paddw m1, m12
+ pmulhrsw m2, m0, m16
+ pmulhrsw m12, m1, m16
+ packuswb m2, m12
+ vpblendmb m2{k2}, m3, m2 ; p1
+%ifidn %2, v
+ mova [t0+strideq*2], m2
+%endif
+
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m12, m0, m16
+ pmulhrsw m13, m1, m16
+ packuswb m12, m13
+ vpblendmb m12{k2}, m4, m12 ; p0
+%ifidn %2, v
+ mova [t0+stride3q], m12
+%endif
+
+ vpbroadcastd m9, [pb_m1_2]
+ vpbroadcastd m4, [pb_m1_0]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+ pmaddubsw m14, m8, pbm1_1
+ pmaddubsw m13, m11, pbm1_1
+ paddw m0, m14
+ paddw m1, m13
+ pmulhrsw m14, m0, m16
+ pmulhrsw m13, m1, m16
+ packuswb m14, m13
+ vpblendmb m14{k2}, m5, m14 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m14
+%endif
+
+ pmaddubsw m8, m9
+ pmaddubsw m11, m9
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, m4
+ pmaddubsw m10, m4
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, m16
+ pmulhrsw m1, m16
+ packuswb m0, m1
+ vpblendmb m0{k2}, m6, m0 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m0
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else ; %1 == 4
+%ifidn %2, v
+ mova [t0+strideq*0], m3 ; p1
+ mova [t0+strideq*1], m4 ; p0
+ mova [t0+strideq*2], m5 ; q0
+ mova [t0+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
+%endif
+%endif
+%endmacro
+
+%define k7 k6
+
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride
+ DECLARE_REG_TMP 9
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
+ vpbroadcastd m28, [pb_m1_1]
+ vpbroadcastd m27, [pw_2048]
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
+ %define pbm1_1 m28
+ %define pw2048 m27
+
+.loop:
+ cmp word [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .v4
+
+.end:
+ add lq, 64
+ add dstq, 64
+ add maskq, 2
+ sub wd, 16
+ jg .loop
+ RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.v4:
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, stride8
+ DECLARE_REG_TMP 9, 10, 11, 12
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea stride8q, [strideq*8]
+ kxnorw k6, k6, k6
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
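+ ; m21/m20/m19 now hold per-lane byte offsets (multiples of stride and
+ ; l_stride) used as gather/scatter indices for the transposed accesses in
+ ; FILTER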
+ %define pbshuf [pb_4x0_4x4_4x8_4x12]
+ %define pbmask [pb_mask]
+ %define pb128 [pb_128]{bcstd}
+ shl l_strideq, 1
+
+.loop:
+ cmp word [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .h4
+
+.end:
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+stride8q*8]
+ add maskq, 2
+ sub hd, 16
+ jg .loop
+ RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.h4:
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride
+ DECLARE_REG_TMP 9
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
+ vpbroadcastd m17, [pb_m1_1]
+ vpbroadcastd m16, [pw_4096]
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
+ %define pbm1_1 m17
+
+.loop:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4
+
+.end:
+ add lq, 64
+ add dstq, 64
+ add maskq, 2
+ sub wd, 16
+ jg .loop
+ RET
+
+%undef k7
+cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, stride8
+ DECLARE_REG_TMP 9, 10, 11
+ mov r7d, 0xffff
+ movzx r8d, r7b
+ cmp hd, 9
+ cmovb r7d, r8d
+ kmovw k6, r7d ; h > 8 ? 0xffff : 0x00ff
+ shl l_strideq, 2
+ sub lq, 4
+ kshiftrw k7, k6, 4 ; h > 8 ? 0xff : 0xf0
+ lea stride3q, [strideq*3]
+ lea stride8q, [strideq*8]
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
+ mova m18, [pb_mask]
+ vpbroadcastd m17, [pb_128]
+ vpbroadcastd m16, [pw_4096]
+ %define pbshuf [pb_4x0_4x4_4x8_4x12]
+ %define pbmask m18
+ %define pb128 m17
+ add l_strideq, l_strideq
+
+.loop:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4
+
+.end:
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+stride8q*8]
+ add maskq, 2
+ sub hd, 16
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter_sse.asm b/third_party/dav1d/src/x86/loopfilter_sse.asm
new file mode 100644
index 0000000000..cd0eb54702
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_sse.asm
@@ -0,0 +1,2348 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+pb_7_1: times 8 db 7, 1
+pb_3_1: times 8 db 3, 1
+pb_2_1: times 8 db 2, 1
+pb_m1_0: times 8 db -1, 0
+pb_m1_1: times 8 db -1, 1
+pb_m1_2: times 8 db -1, 2
+pb_1: times 16 db 1
+pb_2: times 16 db 2
+pb_3: times 16 db 3
+pb_4: times 16 db 4
+pb_16: times 16 db 16
+pb_63: times 16 db 63
+pb_64: times 16 db 64
+pb_128: times 16 db 0x80
+pb_129: times 16 db 0x81
+pb_240: times 16 db 0xf0
+pb_248: times 16 db 0xf8
+pb_254: times 16 db 0xfe
+
+pw_2048: times 8 dw 2048
+pw_4096: times 8 dw 4096
+
+pd_mask: dd 1, 2, 4, 8
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x16 5
+ ; transpose 16x4
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+
+ ; write out
+%assign %%n 0
+%rep 4
+ movd [dstq+strideq *0-2], xm%4
+ movd [dstq+strideq *4-2], xm%5
+ movd [dstq+strideq *8-2], xm%2
+ movd [dstq+stride3q*4-2], xm%1
+ add dstq, strideq
+%if %%n < 3
+ psrldq xm%4, 4
+ psrldq xm%5, 4
+ psrldq xm%2, 4
+ psrldq xm%1, 4
+%endif
+%assign %%n (%%n+1)
+%endrep
+ lea dstq, [dstq+stride3q*4]
+%endmacro
+
+%macro TRANSPOSE_16X16B 2 ; output_transpose, mem
+%if %1 == 0
+ mova %2, m15 ; m7 in 32-bit
+%endif
+
+ ; input in m0-7
+ punpcklbw m15, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+%if ARCH_X86_64
+ SWAP 4, 5, 7
+%else
+ %if %1 == 0
+ mova m5, %2
+ %else
+ mova m5, [esp+1*16]
+ %endif
+ mova %2, m4
+%endif
+ punpcklbw m4, m6, m5
+ punpckhbw m6, m5
+
+ ; interleaved in m15,0,1,2,3,7,4,6
+ punpcklwd m5, m15, m1
+ punpckhwd m15, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+%if ARCH_X86_64
+ SWAP 3, 4, 7
+%else
+ mova m4, %2
+ mova %2, m3
+%endif
+ punpcklwd m3, m4, m6
+ punpckhwd m4, m6
+
+ ; interleaved in m5,15,1,0,2,7,3,4
+ punpckldq m6, m5, m2
+ punpckhdq m5, m2
+%if ARCH_X86_64
+ SWAP 2, 7, 5
+%else
+ mova m2, %2
+ mova [esp+1*16], m5
+%endif
+ punpckldq m5, m15, m2
+ punpckhdq m15, m2
+ punpckldq m2, m1, m3
+ punpckhdq m1, m3
+ punpckldq m3, m0, m4
+ punpckhdq m0, m4
+
+%if ARCH_X86_32
+ mova [esp+0*16], m6
+ mova [esp+2*16], m5
+ mova [esp+3*16], m15
+ mova [esp+4*16], m2
+ mova [esp+5*16], m1
+ mova [esp+6*16], m3
+ mova [esp+7*16], m0
+ mova m8, [esp+ 8*16]
+ mova m9, [esp+ 9*16]
+ mova m10, [esp+10*16]
+ %if %1 == 0
+ mova m11, [esp+11*16]
+ mova m12, [esp+12*16]
+ mova m13, [esp+13*16]
+ mova m14, [esp+14*16]
+ %else
+ mova m11, [esp+20*16]
+ mova m12, [esp+15*16]
+ mova m13, [esp+16*16]
+ mova m14, [esp+17*16]
+ %endif
+%endif
+
+ ; input in m8-m15
+%if ARCH_X86_64
+ SWAP 7, 4
+%endif
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m12, m13
+ punpckhbw m12, m13
+%if ARCH_X86_64
+ mova m13, %2
+%else
+ %if %1 == 0
+ mova m13, [esp+15*16]
+ %else
+ mova m13, [esp+18*16]
+ %endif
+%endif
+ mova %2, m12
+ punpcklbw m12, m14, m13
+ punpckhbw m14, m14, m13
+
+ ; interleaved in m7,8,9,10,11,rsp%2,12,14
+ punpcklwd m13, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ mova m12, %2
+ mova %2, m11
+ punpcklwd m11, m12, m14
+ punpckhwd m12, m14
+
+ ; interleaved in m13,7,9,8,10,rsp%2,11,12
+ punpckldq m14, m13, m10
+ punpckhdq m13, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m12
+ punpckhdq m8, m12
+ mova m12, %2
+ mova %2, m8
+ punpckldq m8, m7, m12
+ punpckhdq m7, m12
+
+%if ARCH_X86_32
+ mova [esp+ 8*16], m10
+ mova [esp+ 9*16], m9
+ mova [esp+10*16], m11
+ SWAP 6, 1
+ SWAP 4, 2
+ SWAP 5, 3
+ mova m6, [esp+0*16]
+ mova m4, [esp+1*16]
+ mova m5, [esp+2*16]
+%endif
+
+ ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7
+ punpcklqdq m12, m6, m14
+ punpckhqdq m6, m14
+ punpcklqdq m14, m4, m13
+ punpckhqdq m4, m13
+ punpcklqdq m13, m5, m8
+ punpckhqdq m5, m8
+%if ARCH_X86_64
+ SWAP 8, 5
+%else
+ mova m8, [esp+3*16]
+ mova [esp+27*16], m5
+ %define m15 m8
+%endif
+ punpcklqdq m5, m15, m7
+ punpckhqdq m15, m7
+
+%if ARCH_X86_32
+ mova [esp+11*16], m12
+ mova [esp+12*16], m6
+ mova [esp+13*16], m14
+ mova [esp+14*16], m4
+ mova [esp+26*16], m13
+ mova [esp+ 0*16], m5
+ mova [esp+ 1*16], m15
+ mova m2, [esp+ 4*16]
+ mova m10, [esp+ 8*16]
+ mova m1, [esp+ 5*16]
+ mova m9, [esp+ 9*16]
+ mova m3, [esp+ 6*16]
+ mova m11, [esp+10*16]
+ mova m0, [esp+ 7*16]
+%endif
+
+ punpcklqdq m7, m2, m10
+ punpckhqdq m2, m10
+ punpcklqdq m10, m1, m9
+ punpckhqdq m1, m9
+ punpcklqdq m9, m3, m11
+ punpckhqdq m3, m11
+ mova m11, %2
+%if ARCH_X86_32
+ %define m12 m3
+%endif
+ mova %2, m12
+ punpcklqdq m12, m0, m11
+ punpckhqdq m0, m11
+%if %1 == 1
+ mova m11, %2
+%endif
+
+%if ARCH_X86_64
+ ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0
+ SWAP 0, 11, 1, 6, 5, 8, 7, 15
+ SWAP 2, 14, 12, 9
+ SWAP 3, 4, 13
+%else
+ %if %1 == 0
+ mova [esp+15*16], m9
+ mova [esp+17*16], m12
+ mova [esp+18*16], m0
+ mova [esp+28*16], m10
+ mova [esp+29*16], m1
+ mova m3, [esp+0*16]
+ mova m4, [esp+1*16]
+ SWAP m5, m7
+ SWAP m6, m2
+ %else
+ SWAP 0, 7
+ SWAP 3, 1, 2, 4, 6
+ %endif
+%endif
+%endmacro
+
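+; On x86-32 only xmm0-7 are available, so the transpose above and the FILTER
+; macro below spill intermediate rows to [esp+N*16] scratch slots (aliasing
+; some mN names to those slots) instead of using m8-m15.
+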
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%if ARCH_X86_64
+ %define %%flat8mem [rsp+0*16]
+ %define %%q2mem [rsp+1*16]
+ %define %%q3mem [rsp+2*16]
+%else
+ %if %1 == 4 || %1 == 6
+ %define %%p2mem [esp+ 8*16]
+ %define %%q2mem [esp+ 9*16]
+ %define %%flat8mem [esp+10*16]
+ %else
+ %ifidn %2, v
+ %define %%p2mem [esp+16*16]
+ %define %%q2mem [esp+ 1*16]
+ %define %%q3mem [esp+18*16]
+ %define %%flat8mem [esp+ 0*16]
+ %define %%flat16mem [esp+20*16]
+ %else
+ %define %%p2mem [esp+27*16]
+ %define %%q2mem [esp+28*16]
+ %define %%q3mem [esp+29*16]
+ %define %%flat8mem [esp+21*16]
+ %define %%flat16mem [esp+30*16]
+ %endif
+ %endif
+ %xdefine m12reg m12
+%endif
+
+%if ARCH_X86_32
+ lea stride3q, [strideq*3]
+%endif
+ ; load data
+%ifidn %2, v
+%if ARCH_X86_32
+ mov mstrideq, strideq
+ neg mstrideq
+%endif
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+ ; we load p3 later
+%define %%p3mem [dstq+mstrideq*4]
+ %if ARCH_X86_32
+ %define m13 m0
+ %define m14 m1
+ %define m15 m2
+ %endif
+ mova m13, [tmpq+strideq*1]
+ mova m3, [tmpq+strideq*2]
+ mova m4, [tmpq+stride3q]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m15, [dstq+stride3q]
+%endif
+ %if ARCH_X86_32
+ mova %%p2mem, m13
+ mova %%q2mem, m14
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %if %1 != 6
+ mova %%q3mem, m15
+ %define m15 %%q3mem
+ %endif
+ %endif
+%endif
+%else ; %2 == h
+ ; load lines
+%if %1 == 4
+ ; transpose 4x16
+ movd m7, [dstq+strideq*0-2]
+ movd m3, [dstq+strideq*1-2]
+ movd m4, [dstq+strideq*2-2]
+ movd m5, [dstq+stride3q -2]
+ lea tmpq, [dstq+strideq*4]
+ punpcklbw m7, m3
+ punpcklbw m4, m5
+ movd m3, [tmpq+strideq*0-2]
+ movd m1, [tmpq+strideq*1-2]
+ movd m5, [tmpq+strideq*2-2]
+ movd m6, [tmpq+stride3q -2]
+ lea tmpq, [tmpq+strideq*4]
+ punpcklbw m3, m1
+ punpcklbw m5, m6
+ movd m0, [tmpq+strideq*0-2]
+ movd m1, [tmpq+strideq*1-2]
+ punpcklbw m0, m1
+ movd m1, [tmpq+strideq*2-2]
+ movd m2, [tmpq+stride3q -2]
+ punpcklbw m1, m2
+ punpcklqdq m7, m0
+ punpcklqdq m4, m1
+ lea tmpq, [tmpq+strideq*4]
+ movd m0, [tmpq+strideq*0-2]
+ movd m1, [tmpq+strideq*1-2]
+ punpcklbw m0, m1
+ movd m1, [tmpq+strideq*2-2]
+ movd m2, [tmpq+stride3q -2]
+ punpcklbw m1, m2
+ punpcklqdq m3, m0
+ punpcklqdq m5, m1
+ ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
+ ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
+ ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
+ ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
+ punpcklwd m6, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ ; xm6: A0-3,B0-3,C0-3,D0-3
+ ; xm7: A8-11,B8-11,C8-11,D8-11
+ ; xm4: A4-7,B4-7,C4-7,D4-7
+ ; xm3: A12-15,B12-15,C12-15,D12-15
+ punpckldq m5, m6, m4
+ punpckhdq m6, m4
+ punpckldq m4, m7, m3
+ punpckhdq m7, m3
+ ; xm5: A0-7,B0-7
+ ; xm6: C0-7,D0-7
+ ; xm4: A8-15,B8-15
+ ; xm7: C8-15,D8-15
+ punpcklqdq m3, m5, m4
+ punpckhqdq m5, m5, m4
+ punpcklqdq m4, m6, m7
+ punpckhqdq m6, m7
+ ; xm3: A0-15
+ ; xm5: B0-15
+ ; xm4: C0-15
+ ; xm6: D0-15
+ SWAP 4, 5
+%elif %1 == 6 || %1 == 8
+ ; transpose 8x16
+ movq m7, [dstq+strideq*0-%1/2]
+ movq m3, [dstq+strideq*1-%1/2]
+ movq m4, [dstq+strideq*2-%1/2]
+ movq m5, [dstq+stride3q -%1/2]
+ lea tmpq, [dstq+strideq*8]
+ punpcklbw m7, m3
+ punpcklbw m4, m5
+ movq m3, [tmpq+strideq*0-%1/2]
+ movq m1, [tmpq+strideq*1-%1/2]
+ movq m5, [tmpq+strideq*2-%1/2]
+ movq m6, [tmpq+stride3q -%1/2]
+ lea tmpq, [dstq+strideq*4]
+ punpcklbw m3, m1
+ punpcklbw m5, m6
+ movq m6, [tmpq+strideq*0-%1/2]
+ movq m0, [tmpq+strideq*1-%1/2]
+ movq m1, [tmpq+strideq*2-%1/2]
+ movq m2, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ punpcklbw m6, m0
+ punpcklbw m1, m2
+ movq m2, [tmpq+strideq*2-%1/2]
+ movq m0, [tmpq+stride3q -%1/2]
+ punpcklbw m2, m0
+%if ARCH_X86_64
+ SWAP m15, m2
+%else
+ %define m15 [esp+3*16]
+ mova m15, m2
+%endif
+ movq m0, [tmpq+strideq*0-%1/2]
+ movq m2, [tmpq+strideq*1-%1/2]
+ punpcklbw m0, m2
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m2, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m1
+ punpckhwd m6, m1
+ punpcklwd m1, m0, m15
+ punpckhwd m0, m15
+%if ARCH_X86_64
+ SWAP m15, m0
+%else
+ mova m15, m0
+%endif
+ ; xm2: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm1: A12-15,B12-15,C12-15,D12-15
+ ; xm0: E12-15,F12-15,G12-15,H12-15
+ punpckldq m0, m2, m5
+ punpckhdq m2, m5
+ punpckldq m5, m7, m6
+%if %1 != 6
+ punpckhdq m7, m6
+%endif
+ punpckldq m6, m4, m1
+ punpckhdq m4, m1
+ punpckldq m1, m3, m15
+%if %1 != 6
+ punpckhdq m3, m15
+ %if ARCH_X86_64
+ SWAP m15, m3
+ %else
+ mova m15, m3
+ %endif
+%endif
+ ; xm0: A0-7,B0-7
+ ; xm2: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm1: E8-15,F8-15
+ ; xm3: G8-15,H8-15
+ punpcklqdq m3, m0, m6
+ punpckhqdq m0, m6
+ punpckhqdq m6, m2, m4
+ punpcklqdq m2, m4
+ punpcklqdq m4, m5, m1
+ punpckhqdq m5, m1
+%if %1 == 8
+ punpcklqdq m1, m7, m15
+ punpckhqdq m7, m15
+ ; xm3: A0-15
+ ; xm0: B0-15
+ ; xm2: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm5: F0-15
+ ; xm1: G0-15
+ ; xm7: H0-15
+%if ARCH_X86_64
+ SWAP 11, 3, 2
+ SWAP 13, 0
+ SWAP 6, 5, 4
+ SWAP 14, 1
+ SWAP 15, 7
+ ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15
+ mova [rsp+21*16], m11
+ %define %%p3mem [rsp+21*16]
+%else
+ %define m11 [esp+26*16]
+ %define m13 [esp+27*16]
+ %define m14 [esp+28*16]
+ %define m15 [esp+29*16]
+ mova m11, m3
+ mova m13, m0
+ SWAP 3, 2
+ SWAP 6, 5, 4
+ mova m14, m1
+ mova m15, m7
+ %define %%p3mem [esp+26*16]
+%endif
+%else
+ %if ARCH_X86_64
+ SWAP 13, 3, 0
+ SWAP 14, 5, 6, 4, 2
+ ; 3,0,2,6,4,5 -> 13,3,4,5,6,14
+ %else
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ mova m13, m3
+ mova m14, m5
+ SWAP 3, 0
+ SWAP 5, 6, 4, 2
+ ; 0,2,6,4 -> 3,4,5,6
+ %endif
+%endif
+%else
+%if ARCH_X86_64
+ mova [rsp+20*16], m12
+%endif
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+%if ARCH_X86_32
+ %xdefine m8 m0
+ %xdefine m9 m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+ lea tmpq, [dstq+strideq*8]
+ movu m8, [tmpq+strideq*0-8]
+ movu m9, [tmpq+strideq*1-8]
+ movu m10, [tmpq+strideq*2-8]
+ movu m11, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu m12, [tmpq+strideq*0-8]
+ movu m13, [tmpq+strideq*1-8]
+ movu m14, [tmpq+strideq*2-8]
+ movu m15, [tmpq+stride3q -8]
+ mova [esp+ 8*16], m8
+ mova [esp+ 9*16], m9
+ mova [esp+10*16], m10
+ mova [esp+11*16], m11
+ mova [esp+12*16], m12
+ mova [esp+13*16], m13
+ mova [esp+14*16], m14
+ mova [esp+15*16], m15
+%endif
+ movu m0, [dstq+strideq*0-8]
+ movu m1, [dstq+strideq*1-8]
+ movu m2, [dstq+strideq*2-8]
+ movu m3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu m4, [tmpq+strideq*0-8]
+ movu m5, [tmpq+strideq*1-8]
+ movu m6, [tmpq+strideq*2-8]
+ movu m7, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+%if ARCH_X86_64
+ movu m8, [tmpq+strideq*0-8]
+ movu m9, [tmpq+strideq*1-8]
+ movu m10, [tmpq+strideq*2-8]
+ movu m11, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu m12, [tmpq+strideq*0-8]
+ movu m13, [tmpq+strideq*1-8]
+ movu m14, [tmpq+strideq*2-8]
+ movu m15, [tmpq+stride3q -8]
+%endif
+
+%if ARCH_X86_64
+ TRANSPOSE_16X16B 0, [rsp+11*16]
+ mova [rsp+12*16], m1
+ mova [rsp+13*16], m2
+ mova [rsp+14*16], m3
+ mova [rsp+15*16], m12
+ mova [rsp+16*16], m13
+ mova [rsp+17*16], m14
+ mova [rsp+18*16], m15
+ ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
+ SWAP 12, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 15
+ mova [rsp+21*16], m12
+ %define %%p3mem [rsp+21*16]
+ mova m12, [rsp+20*16]
+%else
+ TRANSPOSE_16X16B 0, [esp+16*16]
+ %define %%p3mem [esp+26*16]
+ %define m11 %%p3mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
+%endif ; if 4 elif 6 or 8 else 16
+%endif ; if v else h
+
+ ; load L/E/I/H
+%if ARCH_X86_32
+ mov l_strideq, l_stridem
+%endif
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ %if ARCH_X86_32
+ lea l_stride3q, [l_strideq*3]
+ %endif
+ movq xm1, [lq]
+ movq xm2, [lq+l_strideq*2]
+ movhps xm1, [lq+l_strideq]
+ movhps xm2, [lq+l_stride3q]
+ shufps m0, m1, m2, q3131
+ shufps m1, m2, q2020
+ %if ARCH_X86_32
+ lea stride3q, [strideq*3]
+ %endif
+%endif
+
+%if ARCH_X86_32
+ %ifidn %2, v
+ mov lutd, lutm
+ %endif
+%endif
+ pxor m2, m2
+ pcmpeqb m7, m2, m0
+ pand m1, m7
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1]
+ pcmpeqb m2, m0 ; !L
+ psrlq m7, m0, [lutq+128]
+ pand m7, [PIC_sym(pb_63)]
+ pminub m7, minlvl
+ pmaxub m7, [PIC_sym(pb_1)] ; I
+ pand m1, m0, [PIC_sym(pb_240)]
+ psrlq m1, 4 ; H
+ paddb m0, [PIC_sym(pb_2)]
+ paddb m0, m0
+ paddb m0, m7 ; E
+ pxor m1, [PIC_sym(pb_128)]
+ pxor m7, [PIC_sym(pb_128)]
+ pxor m0, [PIC_sym(pb_128)]
+ SWAP 2, 7
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 2, 10
+%else
+ %ifidn %2, v
+ mov mstrideq, strideq
+ neg mstrideq
+ %if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ %elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+ %endif
+ %endif
+ mova [esp+3*16], m0
+ mova [esp+4*16], m2
+%endif
+
+ ABSSUB m0, m3, m4, m2 ; abs(p1-p0)
+ pmaxub m0, m7
+ ABSSUB m2, m5, m6, m7 ; abs(q1-q0)
+ pmaxub m0, m2
+%if %1 == 4
+ pxor m0, [PIC_sym(pb_128)]
+ pcmpgtb m7, m0, m1 ; hev
+ %if ARCH_X86_64
+ SWAP 7, 11
+ %else
+ mova [esp+5*16], m7
+ %endif
+%else
+ pxor m7, m0, [PIC_sym(pb_128)]
+ pcmpgtb m7, m1 ; hev
+%if ARCH_X86_64
+ SWAP 7, 11
+%else
+ mova [esp+5*16], m7
+%endif
+
+%if %1 == 6
+ ABSSUB m1, m13, m4, m7 ; abs(p2-p0)
+ pmaxub m1, m0
+%else
+ mova m2, %%p3mem
+ ABSSUB m1, m2, m4, m7 ; abs(p3-p0)
+ pmaxub m1, m0
+ ABSSUB m7, m13, m4, m2 ; abs(p2-p0)
+ pmaxub m1, m7
+%endif
+ ABSSUB m7, m5, m14, m2 ; abs(q2-q0)
+ pmaxub m1, m7
+%if %1 != 6
+ ABSSUB m7, m5, m15, m2 ; abs(q3-q0)
+ pmaxub m1, m7
+%endif
+ pxor m1, [PIC_sym(pb_128)]
+ pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in
+%if ARCH_X86_64
+ SWAP 1, 9
+%else
+ mova [esp+6*16], m1
+%endif
+
+%if %1 == 6
+ ABSSUB m7, m13, m3, m1 ; abs(p2-p1)
+%else
+ mova m2, %%p3mem
+ ABSSUB m7, m2, m13, m1 ; abs(p3-p2)
+ ABSSUB m2, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m7, m2
+ ABSSUB m2, m14, m15, m1 ; abs(q3-q2)
+ pmaxub m7, m2
+%endif
+ ABSSUB m2, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m7, m2
+%if ARCH_X86_32
+ %define m12 m1
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+ pand m7, m2 ; only apply fm-wide to wd>4 blocks
+ pmaxub m0, m7
+
+ pxor m0, [PIC_sym(pb_128)]
+%endif ; %if %1 == 4 else
+%if ARCH_X86_64
+ SWAP 2, 10
+ pcmpgtb m0, m2
+%else
+ pcmpgtb m0, [esp+4*16]
+%endif
+
+ ABSSUB m1, m3, m6, m7 ; abs(p1-q1)
+ ABSSUB m7, m4, m5, m2 ; abs(p0-q0)
+ paddusb m7, m7
+ pand m1, [PIC_sym(pb_254)]
+ psrlq m1, 1
+ paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pxor m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+ pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+%else
+ pcmpgtb m1, [esp+3*16]
+%endif
+ por m0, m1
+
+%if %1 == 16
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova [esp+3*16], m0
+%endif
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+12*16]
+%endif
+ ABSSUB m1, m0, m4, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+13*16]
+%endif
+ ABSSUB m2, m0, m4, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+stride3q]
+%else
+ mova m0, [rsp+14*16]
+%endif
+ ABSSUB m2, m0, m4, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+%else
+ mova m0, [rsp+15*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+16*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+17*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+ pxor m1, [PIC_sym(pb_128)]
+ pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out
+%if ARCH_X86_64
+ por m1, m9 ; !flat8in | !flat8out
+%else
+ por m1, [esp+6*16]
+ %define m12 m7
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask2
+ pcmpeqd m2, m12
+ pandn m1, m2 ; flat16
+%if ARCH_X86_64
+ pandn m2, m8, m1 ; flat16 & fm
+%else
+ pandn m2, [esp+3*16], m1 ; flat16 & fm
+ mova %%flat16mem, m2
+%endif
+ SWAP 1, 2
+
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+%if ARCH_X86_64
+ pandn m9, m2 ; flat8in
+ pandn m2, m8, m9
+ SWAP 2, 9
+%else
+ pandn m0, [esp+6*16], m2
+ pandn m2, [esp+3*16], m0
+ mova [esp+6*16], m2
+%endif
+ pand m2, m12, mask0
+ pcmpeqd m2, m12
+%if ARCH_X86_64
+ pandn m8, m2
+ pandn m2, m9, m8 ; fm & !flat8 & !flat16
+ SWAP 2, 8
+ pandn m2, m1, m9 ; flat8 & !flat16
+ SWAP 2, 9
+ SWAP 0, 8
+ SWAP 1, 10
+%else
+ pandn m0, [esp+3*16], m2
+ pandn m2, [esp+6*16], m0
+ SWAP 2, 0
+ pandn m2, m1, [esp+6*16]
+ mova %%flat8mem, m2
+%endif
+%elif %1 != 4
+ %if ARCH_X86_64
+ SWAP 1, 9
+ %else
+ %define m12 m7
+ mova m12, maskmem
+ mova m1, [esp+6*16]
+ %endif
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+ pandn m1, m2
+ pandn m2, m0, m1 ; flat8 & fm
+ pand m1, m12, mask0
+ pcmpeqd m1, m12
+ pandn m0, m1
+ pandn m1, m2, m0 ; fm & !flat8
+ SWAP 1, 2, 0
+ %if ARCH_X86_64
+ SWAP 1, 9
+ %else
+ mova %%flat8mem, m1
+ %endif
+%else
+%if ARCH_X86_32
+ %define m12 m1
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask0
+ pcmpeqd m2, m12
+ pandn m0, m2 ; fm
+%endif
+
+ ; short filter
+
+ mova m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+ SWAP 7, 11
+%else
+ mova m7, [esp+5*16]
+%endif
+ pxor m3, m1
+ pxor m6, m1
+ pxor m4, m1
+ pxor m5, m1
+ psubsb m1, m3, m6 ; iclip_diff(p1-q1)
+ pand m1, m7 ; f=iclip_diff(p1-q1)&hev
+ psubsb m2, m5, m4
+ paddsb m1, m2
+ paddsb m1, m2
+ paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f)
+ mova m2, [PIC_sym(pb_16)]
+ pand m0, m1 ; f&=fm
+ paddsb m1, m0, [PIC_sym(pb_3)]
+ paddsb m0, [PIC_sym(pb_4)]
+ pand m1, [PIC_sym(pb_248)]
+ pand m0, [PIC_sym(pb_248)]
+ psrlq m1, 3
+ psrlq m0, 3
+ pxor m1, m2
+ pxor m0, m2
+ psubb m1, m2 ; f2
+ psubb m0, m2 ; f1
+ mova m2, [PIC_sym(pb_128)]
+ paddsb m4, m1
+ psubsb m5, m0
+ pxor m4, m2
+ pxor m5, m2
+
+ pxor m0, m2
+ pxor m1, m1
+ pavgb m0, m1 ; f=(f1+1)>>1
+ psubb m0, [PIC_sym(pb_64)]
+ pandn m7, m0 ; f&=!hev
+ paddsb m3, m7
+ psubsb m6, m7
+ pxor m3, m2
+ pxor m6, m2
+
+%if %1 == 16
+ ; flat16 filter
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+%else
+ mova m0, [rsp+12*16]
+ mova m2, [rsp+13*16]
+ mova m7, [rsp+14*16]
+%endif
+
+%if ARCH_X86_64
+ SWAP 1, 10
+ mova %%flat8mem, m9
+ mova %%q2mem, m14
+ mova %%q3mem, m15
+ SWAP 0, 8
+ SWAP 1, 9
+%else
+ %ifidn %2, v
+ mova [esp+17*16], m0
+ mova [esp+19*16], m3
+ mova [esp+21*16], m4
+ mova [esp+22*16], m5
+ mova [esp+23*16], m6
+ %xdefine m11 m3
+ %xdefine m14 m4
+ %xdefine m15 m5
+ %xdefine m10 m6
+ %define m13 %%p2mem
+ %define m8 [esp+17*16]
+ %define m9 %%flat16mem
+ %define m3 [esp+19*16]
+ %define m4 [esp+21*16]
+ %define m5 [esp+22*16]
+ %define m6 [esp+23*16]
+ %else
+ mova [esp+31*16], m0
+ mova [esp+32*16], m3
+ mova [esp+33*16], m4
+ mova [esp+34*16], m5
+ mova [esp+35*16], m6
+ %xdefine m11 m3
+ %xdefine m14 m4
+ %xdefine m15 m5
+ %xdefine m10 m6
+ %define m13 %%p2mem
+ %define m8 [esp+31*16]
+ %define m9 %%flat16mem
+ %define m3 [esp+32*16]
+ %define m4 [esp+33*16]
+ %define m5 [esp+34*16]
+ %define m6 [esp+35*16]
+ %endif
+%endif
+
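+ ; The flat16 path below (steps A-L) keeps a 16-tap weighted running sum in
+ ; m10/m11 (low/high words). Each step slides the window by one output pixel:
+ ; pmaddubsw with pb_m1_1 on an interleaved byte pair yields (-old + new),
+ ; which is added to the sum, and pmulhrsw with pw_2048 performs the final
+ ; (sum + 8) >> 4 rounding before blending with the unfiltered pixels via m9.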
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ mova m11, %%p3mem
+%if ARCH_X86_64
+ punpcklbw m14, m8, m11
+ punpckhbw m15, m8, m11
+%else
+ punpcklbw m14, m0, m11
+ punpckhbw m15, m0, m11
+%endif
+%ifidn %2, v
+ mova [rsp+5*16], m11
+%endif
+ pmaddubsw m10, m14, [PIC_sym(pb_7_1)]
+ pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+ pmaddubsw m0, [PIC_sym(pb_2)]
+ pmaddubsw m1, [PIC_sym(pb_2)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*7+p5*2+p4*2+p3
+ punpcklbw m0, m13, m3
+ punpckhbw m1, m13, m3
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m0, m4, m5
+ punpckhbw m1, m4, m5
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m2
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m0 ; p5
+%else
+ mova [rsp+13*16], m0
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, [PIC_sym(pb_m1_1)]
+ pmaddubsw m15, [PIC_sym(pb_m1_1)]
+ paddw m10, m14
+ paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m0, m8, m6
+ punpckhbw m1, m8, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+3*16], m0
+ mova [rsp+4*16], m1
+ paddw m10, m0
+ paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m7
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m0 ; p4
+%else
+ mova [rsp+14*16], m0
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ mova m14, %%q2mem
+ punpcklbw m0, m8, m13
+ punpckhbw m1, m8, m13
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m0, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m2, [PIC_sym(pb_m1_1)]
+ mova [rsp+1*16], m0
+ paddw m10, m0
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, %%p3mem
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m0 ; p3
+%else
+ mova [rsp+19*16], m0
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ mova m15, %%q3mem
+ punpcklbw m0, m8, m3
+ punpckhbw m1, m8, m3
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m0, m7, m15
+ punpckhbw m7, m15
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m7, [PIC_sym(pb_m1_1)]
+ mova [rsp+2*16], m0
+%if ARCH_X86_32
+ %ifidn %2, v
+ mova [esp+24*16], m7
+ %else
+ mova [esp+36*16], m7
+ %endif
+%endif
+ paddw m10, m0
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m13
+ por m0, m1
+ mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+ punpcklbw m0, m8, m4
+ punpckhbw m1, m8, m4
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%if ARCH_X86_64
+ SWAP 7, 8
+%endif
+%ifidn %2, v
+ mova m1, [dstq+strideq*4] ; q4
+ mova m7, [rsp+5*16] ; (pre-filter) p3
+%else
+ mova m1, [rsp+15*16]
+ mova m7, %%p3mem ; (pre-filter) p3
+%endif
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m1, m7
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+7*16], m0
+ mova [rsp+5*16], m1
+ psubw m10, m0
+ psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m3
+ por m0, m1
+ mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, v
+ mova m7, [tmpq+strideq*1] ; p6
+ lea tmpq, [dstq+strideq*4]
+ mova m1, [tmpq+strideq*1] ; q5
+%else
+ mova m7, [rsp+12*16] ; p6
+ mova m1, [rsp+16*16]
+%endif
+ punpcklbw m0, m7, m5
+ punpckhbw m7, m5
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m7, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m7, m13, m1
+ pmaddubsw m7, [PIC_sym(pb_m1_1)]
+ mova [rsp+9*16], m7
+ paddw m10, m7
+%if ARCH_X86_64
+ punpckhbw m13, m1
+ mova m1, [rsp+6*16]
+ SWAP 1, 13
+%else
+ punpckhbw m7, m13, m1
+ mova m1, [esp+6*16]
+ mova m13, m1
+ SWAP 1, 7
+%endif
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+10*16], m1
+ paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ pmulhrsw m7, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m0, m11, [PIC_sym(pw_2048)]
+ packuswb m7, m0
+ pand m7, m9
+ pandn m0, m9, m4
+ por m7, m0
+ mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+%ifidn %2, v
+ mova m7, [tmpq+strideq*2] ; q6
+%else
+ mova m7, [rsp+17*16]
+%endif
+ paddw m10, [rsp+3*16]
+ paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ punpcklbw m0, m3, m7
+ punpckhbw m1, m3, m7
+%if ARCH_X86_64
+ mova m3, [rsp+8*16]
+%endif
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+3*16], m0
+ mova [rsp+4*16], m1
+ paddw m10, m0
+ paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m5
+ por m0, m1
+%if ARCH_X86_32
+ mova m1, [esp+8*16]
+ mova m3, m1
+%endif
+ mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ paddw m10, [rsp+1*16]
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+ punpcklbw m0, m4, m7
+ punpckhbw m2, m4, m7
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m2, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+%if ARCH_X86_64
+ mova m4, [rsp+6*16]
+%else
+ %define m4 [esp+6*16]
+%endif
+ pmulhrsw m2, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m2, m1
+ pand m2, m9
+ pandn m1, m9, m6
+ por m2, m1 ; don't clobber q1/m6 since we need it in J
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, [rsp+2*16]
+%if ARCH_X86_64
+ SWAP 7, 8
+ paddw m11, m7
+%else
+ mova m8, m7
+ %ifidn %2, v
+ paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %else
+ paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %endif
+%endif
+ punpcklbw m0, m5, m8
+ punpckhbw m1, m5, m8
+%if ARCH_X86_64
+ mova m5, [rsp+8*16]
+%else
+ %define m5 [esp+8*16]
+%endif
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m7, m1
+ pand m7, m9
+ pandn m1, m9, m14
+ por m7, m1 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ psubw m10, [rsp+7*16]
+ psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ punpcklbw m0, m6, m8
+ punpckhbw m1, m6, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m15
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m0 ; q3
+%else
+ mova [rsp+20*16], m0
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, [rsp+ 9*16]
+ paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ punpcklbw m0, m14, m8
+ punpckhbw m1, m14, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+%ifidn %2, v
+ pandn m1, m9, [tmpq+strideq*0]
+%else
+ pandn m1, m9, [rsp+15*16]
+%endif
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m0 ; q4
+%else
+ mova [rsp+15*16], m0
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, [rsp+3*16]
+ paddw m11, [rsp+4*16] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ punpcklbw m0, m15, m8
+ punpckhbw m1, m15, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, [PIC_sym(pw_2048)]
+ pmulhrsw m11, [PIC_sym(pw_2048)]
+ packuswb m10, m11
+ pand m10, m9
+%ifidn %2, v
+ pandn m11, m9, [tmpq+strideq*1]
+%else
+ pandn m11, m9, [rsp+16*16]
+%endif
+ por m10, m11
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+16*16], m10
+%endif
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 14, 7
+%else
+ %xdefine m3 m11
+ %xdefine m4 m14
+ %xdefine m5 m15
+ %xdefine m6 m10
+ mova %%q2mem, m7
+ %ifidn %2, v
+ mova m3, [esp+19*16]
+ %else
+ mova m3, [esp+32*16]
+ %endif
+ mova m4, [esp+ 6*16]
+ mova m5, [esp+ 8*16]
+%endif
+ SWAP m6, m2
+
+%if ARCH_X86_64
+ mova m9, %%flat8mem
+%endif
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%endif ; if %1 == 16
+%if %1 >= 8
+ ; flat8 filter
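+ ; The outputs below are the standard 8-tap averages, e.g.
+ ; p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3 and
+ ; q2' = (p0 + q0 + q1 + 2*q2 + 3*q3 + 4) >> 3, with the intermediate sums
+ ; updated incrementally (pmaddubsw by +/-1) instead of being recomputed.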
+%if ARCH_X86_32
+ %define m9 %%flat8mem
+ %define m11 m1
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
+ mova m11, %%p3mem
+ punpcklbw m0, m11, m3
+ punpcklbw m7, m13, m4
+ pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+ pmaddubsw m7, [PIC_sym(pb_2_1)]
+ paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m7, m5, [PIC_sym(pb_4)]
+ pmaddubsw m7, [PIC_sym(pb_1)]
+ paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ punpckhbw m1, m11, m3
+ pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+ punpckhbw m0, m13, m4
+ pmaddubsw m0, [PIC_sym(pb_2_1)]
+ paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpckhbw m0, m5, [PIC_sym(pb_4)]
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m13
+ por m0, m1 ; p2
+%ifidn %2, v
+ mova [tmpq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+ SWAP 0, 10
+ %else
+ mova [esp+2*16], m0
+ %endif
+%endif
+
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m3
+ punpckhbw m1, m11, m3
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m0, m13, m6
+ punpckhbw m1, m13, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m3
+ por m0, m1 ; p1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m0
+%else
+ mova [rsp+0*16], m0
+%endif
+
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m3
+ punpckhbw m1, m11, m3
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m4, m14
+ punpckhbw m1, m4, m14
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m4
+ por m0, m1 ; p0
+%ifidn %2, v
+ mova [tmpq+stride3q], m0
+%else
+ mova [rsp+1*16], m0
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m4
+ punpckhbw m11, m11, m4
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m11, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m0, m2, 3
+ psrlw m11, m7, 3
+ packuswb m0, m11
+ pand m0, m9
+ pandn m11, m9, m5
+ por m11, m0 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11
+%elif ARCH_X86_32
+ mova [esp+8*16], m11
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m0, m13, m6
+ punpckhbw m1, m13, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m6
+ por m0, m1 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+ SWAP 0, 13
+ %else
+ mova [esp+9*16], m0
+ %endif
+%endif
+
+ punpcklbw m0, m3, m6
+ punpckhbw m1, m3, m6
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m14, m15
+ punpckhbw m1, m14, m15
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+ pand m2, m9
+ pandn m7, m9, m14
+ por m2, m7 ; q2
+%ifidn %2, v
+ mova [dstq+strideq*2], m2
+%else
+ mova m0, [rsp+0*16]
+%if %1 == 8
+ mova m1, [rsp+1*16]
+ mova m4, %%p3mem
+
+%if ARCH_X86_32
+ %define m10 [esp+2*16]
+ %define m11 [esp+8*16]
+ %define m13 [esp+9*16]
+%endif
+
+ ; 16x8 transpose
+ punpcklbw m3, m4, m10
+ punpckhbw m4, m10
+ punpcklbw m5, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m11, m13
+ punpckhbw m6, m11, m13
+ punpcklbw m7, m2, m15
+ punpckhbw m2, m15
+%if ARCH_X86_64
+ SWAP 2, 15
+%else
+ mova m15, m2
+%endif
+
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m7
+ punpckhwd m1, m7
+ punpcklwd m7, m6, m15
+ punpckhwd m6, m15
+%if ARCH_X86_64
+ SWAP 6, 15
+%else
+ mova m15, m6
+%endif
+
+ punpckldq m6, m2, m0
+ punpckhdq m2, m0
+ punpckldq m0, m3, m1
+ punpckhdq m3, m1
+ punpckldq m1, m5, m7
+ punpckhdq m5, m7
+ punpckldq m7, m4, m15
+ punpckhdq m4, m15
+
+ ; write 8x16
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm2
+ movhps [dstq+stride3q -4], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm0
+ movhps [dstq+strideq*1-4], xm0
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm1
+ movhps [dstq+strideq*1-4], xm1
+ movq [dstq+strideq*2-4], xm5
+ movhps [dstq+stride3q -4], xm5
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm7
+ movhps [dstq+strideq*1-4], xm7
+ movq [dstq+strideq*2-4], xm4
+ movhps [dstq+stride3q -4], xm4
+ lea dstq, [dstq+strideq*4]
+%else
+ ; 16x16 transpose and store
+ SWAP 6, 0
+ SWAP 7, 1
+ %if ARCH_X86_64
+ SWAP 5, 10, 2
+ SWAP 8, 11
+ SWAP 9, 13
+ mova [rsp+21*16], m12
+ %else
+ mova [esp+10*16], m2
+ %xdefine m8 m0
+ %xdefine m9 m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+ %endif
+ mova m0, [rsp+11*16]
+ mova m1, [rsp+12*16]
+ mova m2, [rsp+13*16]
+ mova m3, [rsp+14*16]
+ mova m4, [rsp+19*16]
+%if ARCH_X86_64
+ mova m7, [rsp+ 1*16]
+ mova m11, [rsp+20*16]
+ mova m12, [rsp+15*16]
+ mova m13, [rsp+16*16]
+ mova m14, [rsp+17*16]
+ TRANSPOSE_16X16B 1, [rsp+18*16]
+%else
+ mova m5, [esp+ 2*16]
+ TRANSPOSE_16X16B 1, [esp+32*16]
+ mov tmpq, dstq
+ lea dstq, [dstq+strideq*8]
+%endif
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm1
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm4
+ movu [dstq+strideq*1-8], xm5
+ movu [dstq+strideq*2-8], xm6
+ movu [dstq+stride3q -8], xm7
+%if ARCH_X86_64
+ lea dstq, [dstq+strideq*4]
+%else
+ %xdefine m8 m0
+ %xdefine m9 m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+ mova m8, [esp+11*16]
+ mova m9, [esp+12*16]
+ mova m10, [esp+13*16]
+ mova m11, [esp+14*16]
+ mova m12, [esp+26*16]
+ mova m13, [esp+27*16]
+ mova m14, [esp+ 0*16]
+ mova m15, [esp+ 1*16]
+ mov dstq, tmpq
+%endif
+ movu [dstq+strideq*0-8], xm8
+ movu [dstq+strideq*1-8], xm9
+ movu [dstq+strideq*2-8], xm10
+ movu [dstq+stride3q -8], xm11
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm12
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+%if ARCH_X86_32
+ lea dstq, [dstq+strideq*8]
+%else
+ mova m12, [rsp+21*16]
+%endif
+
+%endif ; if %1 == 8
+%endif ; ifidn %2, v
+%elif %1 == 6
+ ; flat6 filter
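+ ; 6-tap (chroma) averages: p1' = (3*p2 + 2*p1 + 2*p0 + q0 + 4) >> 3,
+ ; p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, and the mirrored q0'/q1',
+ ; computed with the same incremental pmaddubsw scheme as the wider filters;
+ ; pmulhrsw with pw_4096 provides the (x + 4) >> 3 rounding.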
+%if ARCH_X86_32
+ mova [esp+3*16], m3
+ mova [esp+4*16], m4
+ mova [esp+5*16], m5
+ mova [esp+6*16], m6
+ %xdefine m8 m3
+ %xdefine m10 m4
+ %xdefine m11 m5
+ %xdefine m15 m6
+ %define m3 [esp+3*16]
+ %define m4 [esp+4*16]
+ %define m5 [esp+5*16]
+ %define m6 [esp+6*16]
+ %define m9 %%flat8mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+%endif
+
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, [PIC_sym(pb_3_1)]
+ pmaddubsw m1, m11, [PIC_sym(pb_3_1)]
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, [PIC_sym(pb_2)]
+ pmaddubsw m15, m10, [PIC_sym(pb_2)]
+ paddw m0, m2
+ paddw m1, m15
+ pmulhrsw m2, m0, [PIC_sym(pw_4096)]
+ pmulhrsw m15, m1, [PIC_sym(pw_4096)]
+ packuswb m2, m15
+ pand m2, m9
+ pandn m15, m9, m3
+ por m2, m15
+%ifidn %2, v
+ mova [tmpq+strideq*2], m2 ; p1
+%elif ARCH_X86_32
+ mova [esp+11*16], m2
+%endif
+
+ pmaddubsw m8, [PIC_sym(pb_m1_1)]
+ pmaddubsw m11, [PIC_sym(pb_m1_1)]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+%if ARCH_X86_64
+ SWAP 2, 13
+%endif
+ pmaddubsw m8, [PIC_sym(pb_m1_1)]
+ pmaddubsw m11, [PIC_sym(pb_m1_1)]
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m2, m0, [PIC_sym(pw_4096)]
+ pmulhrsw m15, m1, [PIC_sym(pw_4096)]
+ packuswb m2, m15
+ pand m2, m9
+ pandn m15, m9, m4
+ por m2, m15
+%ifidn %2, v
+ mova [tmpq+stride3q], m2 ; p0
+%elif ARCH_X86_32
+ mova [esp+8*16], m2
+%endif
+
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+%if ARCH_X86_64
+ SWAP 2, 14
+%endif
+ pmaddubsw m2, m8, [PIC_sym(pb_m1_1)]
+ pmaddubsw m15, m11, [PIC_sym(pb_m1_1)]
+ paddw m0, m2
+ paddw m1, m15
+ pmulhrsw m2, m0, [PIC_sym(pw_4096)]
+ pmulhrsw m15, m1, [PIC_sym(pw_4096)]
+ packuswb m2, m15
+ pand m2, m9
+ pandn m15, m9, m5
+ por m2, m15
+%ifidn %2, v
+ mova [dstq+strideq*0], m2 ; q0
+%endif
+
+ pmaddubsw m8, [PIC_sym(pb_m1_2)]
+ pmaddubsw m11, [PIC_sym(pb_m1_2)]
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, [PIC_sym(pb_m1_0)]
+ pmaddubsw m10, [PIC_sym(pb_m1_0)]
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, [PIC_sym(pw_4096)]
+ pmulhrsw m1, [PIC_sym(pw_4096)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m6
+ por m0, m1
+%if ARCH_X86_32
+ %xdefine m3 m8
+ %xdefine m4 m10
+ %xdefine m5 m11
+ %xdefine m6 m15
+%endif
+%ifidn %2, v
+ mova [dstq+strideq*1], m0 ; q1
+%else
+ %if ARCH_X86_64
+ SWAP 3, 13
+ SWAP 4, 14
+ %else
+ mova m3, [esp+11*16]
+ mova m4, [esp+ 8*16]
+ %endif
+ SWAP 5, 2
+ SWAP 6, 0
+ TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
+%endif
+%else ; if %1 == 4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
+%endif
+%endif
+%if ARCH_X86_32
+ %define m12 m12reg
+%endif
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 32-bit PIC helpers ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 0 ; PIC_reg
+ %define PIC_reg r2
+ %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4)
+ LEA PIC_reg, $$
+ %endmacro
+
+ %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base
+ %if %1 == 0
+ mov [esp+PIC_reg_stk_offset], PIC_reg
+ mov PIC_reg, maskm
+ %else
+ mov PIC_reg, [esp+PIC_reg_stk_offset]
+ %endif
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 1
+ %endmacro
+ %define PIC_sym(sym) (sym)
+%endif
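+; Usage note: constant loads in the filter code are written as
+; [PIC_sym(pw_2048)] etc.; on x86-64 this expands to a plain symbol reference,
+; while on x86-32 it becomes an address relative to PIC_reg as set up by
+; SETUP_PIC, with XCHG_PIC_REG spilling/reloading that register when it is
+; temporarily needed to hold the mask pointer.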
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+ %assign copy_args 1
+ %else
+ %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 1
+ %if copy_args
+ %define maskm [esp+stack_size-gprsize*1]
+ %define l_stridem [esp+stack_size-gprsize*2]
+ %define lutm [esp+stack_size-gprsize*3]
+ %define %1m [esp+stack_size-gprsize*4]
+ mov r6d, r6m
+ mov maskm, maskd
+ mov lutm, lutd
+ mov %1m, r6d
+ %else
+ %define %1m r6m
+ %endif
+%endmacro
+
+%if ARCH_X86_32
+ %define tmpq r4
+ %define mstrideq r5
+ %define stride3q r6
+ %define l_stride3q r6
+%endif
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS w
+ SETUP_PIC
+ %define m12 m5
+%endif
+ shl l_strideq, 2
+ sub lq, l_strideq
+%if ARCH_X86_64
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movu m0, [maskq]
+ pxor m4, m4
+ movd m3, [lutq+136]
+ pshufb m3, m4
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m1, m2
+ por m0, m1
+ mova [rsp+11*16], m0
+ mova [rsp+12*16], m1
+ mova [rsp+13*16], m2
+ mova [rsp+14*16], m3
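+ ; mask1/mask0 are OR-accumulated so that blocks flagged for a wider filter
+ ; also pass the narrower-filter masks; m3 is the byte at lut+136 broadcast
+ ; (the sharpness-derived limit), used later via minlvl to clamp I.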
+
+%define maskmem [esp+15*16]
+%define mask0 [rsp+11*16]
+%define mask1 [rsp+12*16]
+%define mask2 [rsp+13*16]
+%define minlvl [rsp+14*16]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ je .no_flat16
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+25*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+25*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+25*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_32
+ mova m12, maskmem
+ mov mask_bitsd, [esp+25*16]
+%endif
+.no_filter:
+ pslld m12, 4
+ shl mask_bitsd, 4
+ add lq, 16
+ add dstq, 16
+%if ARCH_X86_64
+ sub wd, 4
+%else
+ sub dword wm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+%else
+cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS h
+ SETUP_PIC
+ %define m12 m5
+%endif
+ sub lq, 4
+ shl l_strideq, 2
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movu m0, [maskq]
+ pxor m4, m4
+ movd m3, [lutq+136]
+ pshufb m3, m4
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m1, m2
+ por m0, m1
+ mova [rsp+22*16], m0
+ mova [rsp+23*16], m1
+ mova [rsp+24*16], m2
+ mova [rsp+25*16], m3
+
+%define maskmem [esp+37*16]
+%define mask0 [rsp+22*16]
+%define mask1 [rsp+23*16]
+%define mask2 [rsp+24*16]
+%define minlvl [rsp+25*16]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ je .no_flat16
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+38*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+38*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+38*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+ jmp .end_noload
+.end:
+ mova m12, maskmem
+ mov l_strideq, l_stridem
+ mov mask_bitsd, [esp+38*16]
+.end_noload:
+%else
+.end:
+%endif
+ lea lq, [lq+l_strideq*4]
+ pslld m12, 4
+ shl mask_bitsd, 4
+%if ARCH_X86_64
+ sub hd, 4
+%else
+ sub dword hm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS w
+ SETUP_PIC
+ %define m12 m4
+%endif
+ shl l_strideq, 2
+ sub lq, l_strideq
+%if ARCH_X86_64
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movq m0, [maskq]
+ pxor m3, m3
+ movd m2, [lutq+136]
+ pshufb m2, m3
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m0, m1
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0 [rsp+0*16]
+%define mask1 [rsp+1*16]
+%define minlvl [rsp+2*16]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+11*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+11*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_32
+ mova m12, maskmem
+ mov mask_bitsd, [esp+11*16]
+%endif
+.no_filter:
+ pslld m12, 4
+ shl mask_bitsd, 4
+ add lq, 16
+ add dstq, 16
+%if ARCH_X86_64
+ sub wd, 4
+%else
+ sub dword wm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+%else
+cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS h
+ SETUP_PIC
+ %define m12 m4
+%endif
+ sub lq, 4
+ shl l_strideq, 2
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movq m0, [maskq]
+ pxor m3, m3
+ movd m2, [lutq+136]
+ pshufb m2, m3
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m0, m1
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0 [rsp+0*16]
+%define mask1 [rsp+1*16]
+%define minlvl [rsp+2*16]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+12*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+12*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+ jmp .end_noload
+.end:
+ mova m12, maskmem
+ mov l_strided, l_stridem
+ mov mask_bitsd, [esp+12*16]
+.end_noload:
+%else
+.end:
+%endif
+ lea lq, [lq+l_strideq*4]
+ pslld m12, 4
+ shl mask_bitsd, 4
+%if ARCH_X86_64
+ sub hd, 4
+%else
+ sub dword hm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
diff --git a/third_party/dav1d/src/x86/looprestoration.h b/third_party/dav1d/src/x86/looprestoration.h
new file mode 100644
index 0000000000..de23be8866
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#include "common/intops.h"
+
+#define decl_wiener_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
+decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
+
+#define decl_sgr_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext))
+
+decl_wiener_filter_fns(sse2);
+decl_wiener_filter_fns(ssse3);
+decl_wiener_filter_fns(avx2);
+decl_wiener_filter_fns(avx512icl);
+decl_sgr_filter_fns(ssse3);
+decl_sgr_filter_fns(avx2);
+decl_sgr_filter_fns(avx512icl);
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
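+    /* Each tier below overwrites the pointers set by the previous one, so the
+     * fastest implementation supported by the detected CPU flags wins; the
+     * early returns skip tiers whose flags are missing. */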
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+#if BITDEPTH == 8
+ c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
+ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3);
+ }
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl);
+#if BITDEPTH == 8
+ /* With VNNI we don't need a 5-tap version. */
+ c->wiener[1] = c->wiener[0];
+#else
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl);
+#endif
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl);
+ }
+#endif
+}
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx2.asm b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
new file mode 100644
index 0000000000..ef25c28474
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
@@ -0,0 +1,2540 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+
+wiener_hshift: dw 4, 4, 1, 1
+wiener_vshift: dw 1024, 1024, 4096, 4096
+wiener_round: dd 1049600, 1048832
+
+pb_m10_m9: times 2 db -10, -9
+pb_m6_m5: times 2 db -6, -5
+pb_m2_m1: times 2 db -2, -1
+pb_2_3: times 2 db 2, 3
+pb_6_7: times 2 db 6, 7
+pw_1023: times 2 dw 1023
+pd_8: dd 8
+pd_25: dd 25
+pd_4096: dd 4096
+pd_34816: dd 34816
+pd_m262128: dd -262128
+pd_0xf00800a4: dd 0xf00800a4
+pd_0xf00801c7: dd 0xf00801c7
+
+%define pw_256 sgr_lshuf5
+
+cextern sgr_x_by_x_avx2
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
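+; t1-t6 track the rows of intermediate (horizontally filtered) data needed by
+; the vertical pass; the pointers are rotated after every output row instead
+; of copying row data around.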
+
+INIT_YMM avx2
+cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastd m12, [fltq+ 0] ; x0 x1
+ lea t4, [wiener_hshift]
+ vbroadcasti128 m7, [wiener_shufB]
+ add wd, wd
+ vpbroadcastd m13, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add lpfq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ add dstq, wq
+ vbroadcasti128 m8, [wiener_shufC]
+ lea t1, [rsp+wq+16]
+ vbroadcasti128 m9, [wiener_shufD]
+ neg wq
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ vpbroadcastd m10, [base+wiener_round+t3*4]
+ vpbroadcastd m11, [base+wiener_vshift+t3*4]
+ pmullw m12, m0 ; upshift filter coefs to make the
+ pmullw m13, m0 ; horizontal downshift constant
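+ ; t3 = pixel_max >> 11 is 0 for 10 bpc (1023) and 1 for 12 bpc (4095); it
+ ; indexes the wiener_hshift/round/vshift tables so the same code handles
+ ; both bitdepths with different scaling constants.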
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.extend_right:
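+ ; pad the row past the right edge by clamping the byte shuffle indices so
+ ; that out-of-range lanes re-read the last valid pixel (edge replication)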
+ movd xm1, r10d
+ vpbroadcastd m0, [pb_6_7]
+ movu m2, [pb_0to31]
+ vpbroadcastb m1, xm1
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m3, m0
+ vpbroadcastd m0, [pb_m2_m1]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m4, m0
+ vpbroadcastd m0, [pb_m10_m9]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m5, m0
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm3, [leftq]
+ vpblendd m3, [lpfq+r10-8], 0xfc
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m3, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ shufpd m3, m4, 0x05
+ pshufb m3, [wiener_lshuf7]
+ jmp .h_main2
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-8]
+.h_main:
+ mova m4, [lpfq+r10+0]
+.h_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -36
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ vpbroadcastd m2, [pd_m262128] ; (1 << 4) - (1 << 18)
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm3, [leftq]
+ vpblendd m3, [lpfq+r10-8], 0xfc
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ movu m3, [lpfq+r10-8]
+ pshufb m3, [wiener_lshuf7]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-8]
+.hv_main:
+ mova m4, [lpfq+r10+0]
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -36
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ vpbroadcastd m2, [pd_m262128]
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10]
+ mova m5, [t3+r10]
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova m4, [t5+r10]
+ paddw m4, [t1+r10]
+ psraw m0, 1
+ paddw m3, m0, [t6+r10]
+ mova [t0+r10], m0
+ punpcklwd m0, m2, m5
+ pmaddwd m0, m15
+ punpckhwd m2, m5
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m4
+ pmaddwd m1, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m2, m10
+ paddd m0, m1
+ paddd m2, m3
+ psrad m0, 5
+ psrad m2, 5
+ packusdw m0, m2
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m1, [t4+r10]
+ paddw m1, [t2+r10]
+ mova m2, [t3+r10]
+ mova m4, [t1+r10]
+ paddw m3, m4, [t6+r10]
+ paddw m4, [t5+r10]
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m15
+ punpckhwd m1, m2
+ pmaddwd m1, m15
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m1, m10
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
+cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m5, [wiener_shufE]
+ vpbroadcastw m11, [fltq+ 2] ; x1
+ vbroadcasti128 m6, [wiener_shufB]
+ lea t4, [wiener_hshift]
+ vbroadcasti128 m7, [wiener_shufD]
+ add wd, wd
+ vpbroadcastd m12, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq
+ vpbroadcastw m13, [fltq+18] ; y1
+ add dstq, wq
+ vpbroadcastd m14, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq+16]
+ neg wq
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ vpbroadcastd m9, [base+wiener_round+t3*4]
+ vpbroadcastd m10, [base+wiener_vshift+t3*4]
+ movu xm15, [wiener_lshuf5]
+ pmullw m11, m0
+ vinserti128 m15, [pb_0to31], 1
+ pmullw m12, m0
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.extend_right:
+ movd xm2, r10d
+ vpbroadcastd m0, [pb_2_3]
+ vpbroadcastd m1, [pb_m6_m5]
+ vpbroadcastb m2, xm2
+ psubb m0, m2
+ psubb m1, m2
+ movu m2, [pb_0to31]
+ pminub m0, m2
+ pminub m1, m2
+ pshufb m3, m0
+ pshufb m4, m1
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm3, [leftq+4]
+ vpblendd m3, [lpfq+r10-4], 0xfe
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m4, [lpfq+r10] ; avoid accessing memory located
+ mova m3, [lpfq+r10] ; before the start of the buffer
+ palignr m3, m4, 12
+ pshufb m3, m15
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-4]
+.h_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm3, [leftq+4]
+ vpblendd m3, [lpfq+r10-4], 0xfe
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ movu m3, [lpfq+r10-4]
+ pshufb m3, m15
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-4]
+.hv_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+ mova m2, [t3+r10]
+ paddw m2, [t1+r10]
+ paddd m1, m3
+ mova m4, [t2+r10]
+ punpckhwd m3, m2, m4
+ pmaddwd m3, m14
+ punpcklwd m2, m4
+ mova m4, [t4+r10]
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ pmaddwd m2, m14
+ psraw m0, 1
+ mova [t0+r10], m0
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m0, [t1+r10]
+ paddw m2, m0, [t3+r10]
+ mova m1, [t2+r10]
+ mova m4, [t4+r10]
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ ret
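+; Rough outline of the 5-tap Wiener pass above, a loose sketch rather than
+; the exact fixed-point sequence: .h/.hv filter one row horizontally with the
+; x1/x2/x3 taps (pre-multiplied by wiener_hshift so the downshift after the
+; multiplies stays constant across bitdepths) and store a 16-bit intermediate
+; row in the 384*2-byte ring buffer addressed by t0-t4; .v/.hv then combine
+; five buffered rows with the symmetric y1/y2/y3 taps, add wiener_round,
+; shift and scale by wiener_vshift (pmulhuw) to produce the output row.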
+
+cglobal sgr_filter_5x5_16bpc, 4, 14, 15, 400*24+16, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x_avx2+256*4]
+ movifnidn hd, hm
+ mov edged, r7m
+ add wd, wd
+ vpbroadcastw m7, [paramsq+8] ; w0
+ add lpfq, wq
+ vpbroadcastd m8, [pd_8]
+ add dstq, wq
+ vpbroadcastd m9, [pd_25]
+ lea t3, [rsp+wq*2+400*12+16]
+ vpbroadcastd m10, [paramsq+0] ; s0
+ lea t4, [rsp+wq+400*20+16]
+ vpbroadcastd m11, [pd_0xf00800a4]
+ lea t1, [rsp+wq+20]
+ mova xm12, [sgr_lshuf5]
+ neg wq
+ vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15)
+ pxor m6, m6
+ vpbroadcastd m14, [pw_1023]
+ psllw m7, 4
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+400*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+ vpbroadcastw m0, [lpfq-2]
+ movu m1, [r13+r10+ 0]
+ movu m2, [r13+r10+16]
+ vpblendvb m4, m0, m1
+ vpblendvb m5, m0, m2
+ ret
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10- 2]
+.h_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -36
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ shufpd m5, m4, m5, 0x05
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10+400*0]
+ paddd m1, [t1+r10+400*2]
+ paddd m2, [t1+r10+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+r10+400*0], m0
+ mova [t1+r10+400*2], m1
+ mova [t1+r10+400*4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-4]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10+400*0]
+ mova m1, [t1+r10+400*2]
+ mova m2, [t1+r10+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m1
+ mova [t2+r10+400*4], m2
+ add r10, 32
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .hv_main
+.hv_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10- 2]
+.hv_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -36
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ shufpd m5, m4, m5, 0x05
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+r10+400*0]
+ paddd m4, m2, [t1+r10+400*2]
+ paddd m5, m3, [t1+r10+400*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10+400*0] ; hv sum
+ paddd m4, [t2+r10+400*2] ; hv sumsq
+ paddd m5, [t2+r10+400*4]
+ mova [t0+r10+400*0], m0
+ mova [t0+r10+400*2], m2
+ mova [t0+r10+400*4], m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ punpcklwd m2, m3, m6
+ psrld m5, 4
+ punpckhwd m3, m6
+ pmulld m4, m9 ; a * 25
+ pmulld m5, m9
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ pmaxud m5, m3
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m10 ; p * s
+ pmulld m5, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+r10+4], m2
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+r10*2+ 8], xm0
+ vextracti128 [t3+r10*2+40], m0, 1
+ mova [t3+r10*2+24], xm1
+ vextracti128 [t3+r10*2+56], m1, 1
+ add r10, 32
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10+400*0], m1
+ paddw m1, m0
+ mova [t1+r10+400*2], m4
+ paddd m4, m2
+ mova [t1+r10+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-4]
+.v_loop:
+ mova m0, [t1+r10+400*0]
+ mova m2, [t1+r10+400*2]
+ mova m3, [t1+r10+400*4]
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m4, m2, [t2+r10+400*2]
+ paddd m5, m3, [t2+r10+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ paddd m5, m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ punpcklwd m2, m3, m6
+ psrld m5, 4
+ punpckhwd m3, m6
+ pmulld m4, m9 ; a * 25
+ pmulld m5, m9
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ pmaxud m5, m3
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m10 ; p * s
+ pmulld m5, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+r10+4], m2
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+r10*2+ 8], xm0
+ vextracti128 [t3+r10*2+40], m0, 1
+ mova [t3+r10*2+24], xm1
+ vextracti128 [t3+r10*2+56], m1, 1
+ add r10, 32
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+36]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m4, m1, [t3+r10*2+ 0]
+ paddd m5, m2, [t3+r10*2+32]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m4, [t3+r10*2+ 8]
+ paddd m5, [t3+r10*2+40]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+r10*1+400*2+ 0], m0
+ mova [t3+r10*2+400*4+ 0], m1
+ mova [t3+r10*2+400*4+32], m2
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+36]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m4, m1, [t3+r10*2+ 0]
+ paddd m5, m2, [t3+r10*2+32]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m4, [t3+r10*2+ 8]
+ paddd m5, [t3+r10*2+40]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+r10*1+400*2+ 0]
+ paddd m4, m1, [t3+r10*2+400*4+ 0]
+ paddd m5, m2, [t3+r10*2+400*4+32]
+ mova [t4+r10*1+400*2+ 0], m0
+ mova [t3+r10*2+400*4+ 0], m1
+ mova [t3+r10*2+400*4+32], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m4, m3
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m0, [dstq+r10]
+ mova m3, [t4+r10*1+400*2+ 0]
+ mova m4, [t3+r10*2+400*4+ 0]
+ mova m5, [t3+r10*2+400*4+32]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 7)
+ psubd m4, m3
+ psrad m1, 8
+ psrad m4, 8
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
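+; Rough outline of the 5x5 self-guided pass above, a loose sketch of the
+; per-pixel math rather than the exact fixed-point sequence: .h/.hv/.v build
+; 5x5 box sums (sum, sumsq), compute p = max(((sumsq+8)>>4)*25 -
+; ((sum+2)>>2)^2, 0), derive z from p*s0 with saturating rounding, look up
+; x = sgr_x_by_x[min(z, 255)] and store it together with the rounded
+; x*sum*164 ("b") in t4/t3; .prep_n/.n0/.n1 then apply the 565 row weighting
+; and write src + w0*(b - a*src) with the final shifts, clamped to the
+; pixel range.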
+
+cglobal sgr_filter_3x3_16bpc, 4, 14, 14, 400*42+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x_avx2+256*4]
+ add wd, wd
+ movifnidn hd, hm
+ mov edged, r7m
+ add lpfq, wq
+ vpbroadcastw m7, [paramsq+10] ; w1
+ add dstq, wq
+ vpbroadcastd m9, [paramsq+ 4] ; s1
+ lea t3, [rsp+wq*2+400*12+8]
+ vpbroadcastd m8, [pd_8]
+ lea t4, [rsp+wq+400*32+8]
+ vpbroadcastd m10, [pd_0xf00801c7]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m11, [pd_34816]
+ neg wq
+ mova xm12, [sgr_lshuf3]
+ pxor m6, m6
+ vpbroadcastd m13, [pw_1023]
+ psllw m7, 4
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+r10+400*0]
+ mova m1, [t1+r10+400*2]
+ mova m2, [t1+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m1
+ mova [t2+r10+400*4], m2
+ add r10, 32
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.extend_right:
+ vpbroadcastw m0, [lpfq-2]
+ movu m1, [r13+r10+ 2]
+ movu m2, [r13+r10+18]
+ vpblendvb m4, m0, m1
+ vpblendvb m5, m0, m2
+ ret
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 12
+ jmp .h_main
+.h_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+12], 1
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10+ 0]
+.h_main:
+ movu m5, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+r10+400*0], m1
+ mova [t1+r10+400*2], m2
+ mova [t1+r10+400*4], m3
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 12
+ jmp .hv0_main
+.hv0_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+12], 1
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m4, [lpfq+r10+ 0]
+.hv0_main:
+ movu m5, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -34
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+r10+400*0]
+ paddd m4, m2, [t1+r10+400*2]
+ paddd m5, m3, [t1+r10+400*4]
+ mova [t1+r10+400*0], m1
+ mova [t1+r10+400*2], m2
+ mova [t1+r10+400*4], m3
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m2
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 12
+ jmp .hv1_main
+.hv1_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+12], 1
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m4, [lpfq+r10+ 0]
+.hv1_main:
+ movu m5, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -34
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m4, m2, [t2+r10+400*2]
+ paddd m5, m3, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m2
+ mova [t2+r10+400*4], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ psrld m5, 4
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*2 +4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m0, [t1+r10+400*0]
+ mova m4, [t1+r10+400*2]
+ mova m5, [t1+r10+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m2
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m0, [t1+r10+400*0]
+ mova m4, [t1+r10+400*2]
+ mova m5, [t1+r10+400*4]
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*2+ 4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ mova xm0, [t4+r10*1+400*0+0]
+ paddw xm0, [t4+r10*1+400*0+4]
+ paddw xm2, xm0, [t4+r10*1+400*0+2]
+ mova m1, [t3+r10*2+400*0+0]
+ paddd m1, [t3+r10*2+400*0+8]
+ paddd m3, m1, [t3+r10*2+400*0+4]
+ psllw xm2, 2 ; a[-1] 444
+ pslld m3, 2 ; b[-1] 444
+ psubw xm2, xm0 ; a[-1] 343
+ psubd m3, m1 ; b[-1] 343
+ mova [t4+r10*1+400* 4], xm2
+ mova [t3+r10*2+400* 8], m3
+ mova xm0, [t4+r10*1+400*2+0]
+ paddw xm0, [t4+r10*1+400*2+4]
+ paddw xm2, xm0, [t4+r10*1+400*2+2]
+ mova m1, [t3+r10*2+400*4+0]
+ paddd m1, [t3+r10*2+400*4+8]
+ paddd m3, m1, [t3+r10*2+400*4+4]
+ psllw xm2, 2 ; a[ 0] 444
+ pslld m3, 2 ; b[ 0] 444
+ mova [t4+r10*1+400* 6], xm2
+ mova [t3+r10*2+400*12], m3
+ psubw xm2, xm0 ; a[ 0] 343
+ psubd m3, m1 ; b[ 0] 343
+ mova [t4+r10*1+400* 8], xm2
+ mova [t3+r10*2+400*16], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ mova m3, [t4+r10*1+400*0+0]
+ paddw m3, [t4+r10*1+400*0+4]
+ paddw m1, m3, [t4+r10*1+400*0+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+400*4]
+ paddw m3, [t4+r10*1+400*6]
+ mova [t4+r10*1+400*4], m2
+ mova [t4+r10*1+400*6], m1
+ mova m4, [t3+r10*2+400*0+0]
+ paddd m4, [t3+r10*2+400*0+8]
+ paddd m1, m4, [t3+r10*2+400*0+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+r10*2+400* 8+ 0]
+ paddd m4, [t3+r10*2+400*12+ 0]
+ mova [t3+r10*2+400* 8+ 0], m2
+ mova [t3+r10*2+400*12+ 0], m1
+ mova m5, [t3+r10*2+400*0+32]
+ paddd m5, [t3+r10*2+400*0+40]
+ paddd m1, m5, [t3+r10*2+400*0+36]
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+r10*2+400* 8+32]
+ paddd m5, [t3+r10*2+400*12+32]
+ mova [t3+r10*2+400* 8+32], m2
+ mova [t3+r10*2+400*12+32], m1
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m4, m3
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m3, [t4+r10*1+400*2+0]
+ paddw m3, [t4+r10*1+400*2+4]
+ paddw m1, m3, [t4+r10*1+400*2+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+400*6]
+ paddw m3, [t4+r10*1+400*8]
+ mova [t4+r10*1+400*6], m1
+ mova [t4+r10*1+400*8], m2
+ mova m4, [t3+r10*2+400*4+0]
+ paddd m4, [t3+r10*2+400*4+8]
+ paddd m1, m4, [t3+r10*2+400*4+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+r10*2+400*12+ 0]
+ paddd m4, [t3+r10*2+400*16+ 0]
+ mova [t3+r10*2+400*12+ 0], m1
+ mova [t3+r10*2+400*16+ 0], m2
+ mova m5, [t3+r10*2+400*4+32]
+ paddd m5, [t3+r10*2+400*4+40]
+ paddd m1, m5, [t3+r10*2+400*4+36]
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+r10*2+400*12+32]
+ paddd m5, [t3+r10*2+400*16+32]
+ mova [t3+r10*2+400*12+32], m1
+ mova [t3+r10*2+400*16+32], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m4, m3
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
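+; The 3x3 variant above follows the same scheme as the 5x5 filter, loosely
+; sketched: the box-sum multiplier is 9 instead of 25, the b plane is scaled
+; by 455 instead of 164, the strength/weight are s1/w1, and the neighbour
+; passes use the 444/343 row weighting (see the a[-1]/a[ 0]/a[ 1] comments)
+; with separate even-row (.hv0/.v0) and odd-row (.hv1/.v1) box-sum passes
+; instead of the 565 weighting.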
+
+cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x_avx2+256*4]
+ add wd, wd
+ movifnidn hd, hm
+ mov edged, r7m
+ add lpfq, wq
+ vpbroadcastd m15, [paramsq+8] ; w0 w1
+ add dstq, wq
+ vpbroadcastd m13, [paramsq+0] ; s0
+ lea t3, [rsp+wq*2+400*24+8]
+ vpbroadcastd m14, [paramsq+4] ; s1
+ lea t4, [rsp+wq+400*52+8]
+ vpbroadcastd m9, [pd_8]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m10, [pd_34816]
+ neg wq
+ vpbroadcastd m11, [pd_4096]
+ pxor m7, m7
+ vpbroadcastd m12, [pd_0xf00801c7]
+ psllw m15, 2
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
+ add t1, 400*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+r10+400* 0]
+ mova m1, [t1+r10+400* 2]
+ mova m2, [t1+r10+400* 4]
+ paddw m0, m0
+ mova m3, [t1+r10+400* 6]
+ paddd m1, m1
+ mova m4, [t1+r10+400* 8]
+ paddd m2, m2
+ mova m5, [t1+r10+400*10]
+ mova [t2+r10+400* 0], m0
+ mova [t2+r10+400* 2], m1
+ mova [t2+r10+400* 4], m2
+ mova [t2+r10+400* 6], m3
+ mova [t2+r10+400* 8], m4
+ mova [t2+r10+400*10], m5
+ add r10, 32
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, [sgr_lshuf5]
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10- 2]
+.h_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -36
+ jl .h_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
+.h_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m6, m0, m7
+ pmaddwd m6, m6
+ punpckhwd m0, m7
+ pmaddwd m0, m0
+ paddd m2, m6 ; sumsq3
+ shufpd m6, m4, m5, 0x05
+ punpcklwd m5, m6, m4
+ paddw m8, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ paddd m3, m0
+ mova [t1+r10+400* 6], m1
+ mova [t1+r10+400* 8], m2
+ mova [t1+r10+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m5, m2 ; sumsq5
+ paddd m6, m3
+ mova [t1+r10+400* 0], m8
+ mova [t1+r10+400* 2], m5
+ mova [t1+r10+400* 4], m6
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .hv0_main
+.hv0_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, [sgr_lshuf5]
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m4, [lpfq+r10- 2]
+.hv0_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -36
+ jl .hv0_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
+.hv0_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m6, m0, m7
+ pmaddwd m6, m6
+ punpckhwd m0, m7
+ pmaddwd m0, m0
+ paddd m2, m6 ; h sumsq3
+ shufpd m6, m4, m5, 0x05
+ punpcklwd m5, m6, m4
+ paddw m8, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m5, m2 ; h sumsq5
+ paddd m6, m3
+ mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
+ mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd
+ mova [t3+r10*2+400*0+40], m6
+ paddw m8, [t1+r10+400* 0]
+ paddd m5, [t1+r10+400* 2]
+ paddd m6, [t1+r10+400* 4]
+ mova [t1+r10+400* 0], m8
+ mova [t1+r10+400* 2], m5
+ mova [t1+r10+400* 4], m6
+ paddw m0, m1, [t1+r10+400* 6]
+ paddd m4, m2, [t1+r10+400* 8]
+ paddd m5, m3, [t1+r10+400*10]
+ mova [t1+r10+400* 6], m1
+ mova [t1+r10+400* 8], m2
+ mova [t1+r10+400*10], m3
+ paddw m1, m0, [t2+r10+400* 6]
+ paddd m2, m4, [t2+r10+400* 8]
+ paddd m3, m5, [t2+r10+400*10]
+ mova [t2+r10+400* 6], m0
+ mova [t2+r10+400* 8], m4
+ mova [t2+r10+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmaxud m4, m2
+ psubd m4, m2 ; p3
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*2+ 4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .hv1_main
+.hv1_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, [sgr_lshuf5]
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m4, [lpfq+r10- 2]
+.hv1_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -36
+ jl .hv1_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
+.hv1_have_right:
+ palignr m6, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m6, m3
+ punpcklwd m0, m6, m3
+ pmaddwd m0, m0
+ punpckhwd m6, m3
+ pmaddwd m6, m6
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m7
+ pmaddwd m1, m1
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ shufpd m1, m4, m5, 0x05
+ punpckhwd m5, m4, m1
+ paddw m8, m4, m1
+ pmaddwd m5, m5
+ punpcklwd m4, m1
+ pmaddwd m4, m4
+ paddd m6, m3
+ paddw m1, m2, [t2+r10+400* 6]
+ mova [t2+r10+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+r10+400* 8]
+ paddd m3, m6, [t2+r10+400*10]
+ mova [t2+r10+400* 8], m0
+ mova [t2+r10+400*10], m6
+ paddd m4, m0 ; h sumsq5
+ paddd m5, m6
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m0, m2, 3
+ pslld m6, m3, 3
+ paddd m2, m0 ; ((a3 + 8) >> 4) * 9
+ paddd m3, m6
+ psrlw m6, m1, 1
+ pavgw m6, m7 ; (b3 + 2) >> 2
+ punpcklwd m0, m6, m7
+ pmaddwd m0, m0
+ punpckhwd m6, m7
+ pmaddwd m6, m6
+ pmaxud m2, m0
+ psubd m2, m0 ; p3
+ pmaxud m3, m6
+ psubd m3, m6
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmulld m2, m14 ; p3 * s1
+ pmulld m3, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrad m7, m2, 20 ; min(z3, 255) - 256
+ vpgatherdd m6, [r13+m7*4], m2 ; x3
+ psrad m2, m3, 20
+ vpgatherdd m7, [r13+m2*4], m3
+ pmulld m0, m6
+ packssdw m6, m7
+ pmulld m7, m1
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m7, m10
+ psrld m0, 12
+ psrld m7, 12
+ paddw m1, m8, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ paddw m1, [t1+r10+400*0]
+ paddd m2, [t1+r10+400*2]
+ paddd m3, [t1+r10+400*4]
+ mova [t2+r10+400*0], m8
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ mova [t4+r10*1+400*4 +4], m6
+ mova [t3+r10*2+400*8+ 8], xm0
+ vextracti128 [t3+r10*2+400*8+40], m0, 1
+ mova [t3+r10*2+400*8+24], xm7
+ vextracti128 [t3+r10*2+400*8+56], m7, 1
+ vpbroadcastd m4, [pd_25]
+ pxor m7, m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
+ pmulld m3, m4
+ psrlw m5, m1, 1
+ pavgw m5, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m7
+ pmaddwd m4, m4
+ punpckhwd m5, m7
+ pmaddwd m5, m5
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmaxud m2, m4
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [pd_0xf00800a4]
+ pmaxud m3, m5
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r13+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r13+m2*4], m3
+ pmulld m0, m4
+ pmulld m1, m5
+ packssdw m4, m5
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m4
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m0, [t1+r10+400* 6]
+ mova m4, [t1+r10+400* 8]
+ mova m5, [t1+r10+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+r10+400* 6]
+ paddd m2, m4, [t2+r10+400* 8]
+ paddd m3, m5, [t2+r10+400*10]
+ mova [t2+r10+400* 6], m0
+ mova [t2+r10+400* 8], m4
+ mova [t2+r10+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmaxud m4, m2
+ psubd m4, m2 ; p3
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+r10+400*0]
+ mova m4, [t1+r10+400*2]
+ mova m5, [t1+r10+400*4]
+ mova [t3+r10*2+400*8+ 8], m3
+ mova [t3+r10*2+400*0+ 8], m4
+ mova [t3+r10*2+400*0+40], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+r10+400*0], m3
+ mova [t1+r10+400*2], m4
+ mova [t1+r10+400*4], m5
+ mova [t4+r10*1+400*2+ 4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m4, [t1+r10+400* 6]
+ mova m5, [t1+r10+400* 8]
+ mova m6, [t1+r10+400*10]
+ paddw m1, m4, [t2+r10+400* 6]
+ paddd m2, m5, [t2+r10+400* 8]
+ paddd m3, m6, [t2+r10+400*10]
+ mova [t2+r10+400* 6], m4
+ mova [t2+r10+400* 8], m5
+ mova [t2+r10+400*10], m6
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmaxud m4, m2
+ psubd m4, m2 ; p3
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova [t4+r10*1+400*4+4], m2
+ mova m4, [t3+r10*2+400*8+ 8]
+ mova m5, [t3+r10*2+400*0+ 8]
+ mova m6, [t3+r10*2+400*0+40]
+ paddw m1, m4, [t2+r10+400*0]
+ paddd m2, m5, [t2+r10+400*2]
+ paddd m3, m6, [t2+r10+400*4]
+ paddw m1, [t1+r10+400*0]
+ paddd m2, [t1+r10+400*2]
+ paddd m3, [t1+r10+400*4]
+ mova [t2+r10+400*0], m4
+ mova [t2+r10+400*2], m5
+ mova [t2+r10+400*4], m6
+ vpbroadcastd m4, [pd_25]
+ mova [t3+r10*2+400*8+ 8], xm0
+ vextracti128 [t3+r10*2+400*8+40], m0, 1
+ mova [t3+r10*2+400*8+24], xm8
+ vextracti128 [t3+r10*2+400*8+56], m8, 1
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
+ pmulld m3, m4
+ psrlw m5, m1, 1
+ pavgw m5, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m7
+ pmaddwd m4, m4
+ punpckhwd m5, m7
+ pmaddwd m5, m5
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmaxud m2, m4
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [pd_0xf00800a4]
+ pmaxud m3, m5
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r13+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r13+m2*4], m3
+ pmulld m0, m4
+ pmulld m1, m5
+ packssdw m4, m5
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m4
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu xm0, [t4+r10*1+400*0+2]
+ paddw xm2, xm0, [t4+r10*1+400*0+0]
+ paddw xm2, [t4+r10*1+400*0+4]
+ movu m1, [t3+r10*2+400*0+4]
+ paddd m3, m1, [t3+r10*2+400*0+0]
+ paddd m3, [t3+r10*2+400*0+8]
+ paddw xm0, xm2
+ paddd m1, m3
+ psllw xm2, 2
+ pslld m3, 2
+ paddw xm0, xm2 ; a5 565
+ paddd m1, m3 ; b5 565
+ mova [t4+r10*1+400* 6], xm0
+ mova [t3+r10*2+400*12], m1
+ mova xm0, [t4+r10*1+400*2+0]
+ paddw xm0, [t4+r10*1+400*2+4]
+ paddw xm2, xm0, [t4+r10*1+400*2+2]
+ mova m1, [t3+r10*2+400*4+0]
+ paddd m1, [t3+r10*2+400*4+8]
+ paddd m3, m1, [t3+r10*2+400*4+4]
+ psllw xm2, 2 ; a3[-1] 444
+ pslld m3, 2 ; b3[-1] 444
+ psubw xm2, xm0 ; a3[-1] 343
+ psubd m3, m1 ; b3[-1] 343
+ mova [t4+r10*1+400* 8], xm2
+ mova [t3+r10*2+400*16], m3
+ mova xm0, [t4+r10*1+400*4+0]
+ paddw xm0, [t4+r10*1+400*4+4]
+ paddw xm2, xm0, [t4+r10*1+400*4+2]
+ mova m1, [t3+r10*2+400*8+0]
+ paddd m1, [t3+r10*2+400*8+8]
+ paddd m3, m1, [t3+r10*2+400*8+4]
+ psllw xm2, 2 ; a3[ 0] 444
+ pslld m3, 2 ; b3[ 0] 444
+ mova [t4+r10*1+400*10], xm2
+ mova [t3+r10*2+400*20], m3
+ psubw xm2, xm0 ; a3[ 0] 343
+ psubd m3, m1 ; b3[ 0] 343
+ mova [t4+r10*1+400*12], xm2
+ mova [t3+r10*2+400*24], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu xm2, [t4+r10*1+2]
+ paddw xm0, xm2, [t4+r10*1+0]
+ paddw xm0, [t4+r10*1+4]
+ paddw xm2, xm0
+ psllw xm0, 2
+ paddw xm0, xm2 ; a5
+ movu m1, [t3+r10*2+4]
+ paddd m4, m1, [t3+r10*2+0]
+ paddd m4, [t3+r10*2+8]
+ paddd m1, m4
+ pslld m4, 2
+ paddd m4, m1 ; b5
+ paddw xm2, xm0, [t4+r10*1+400* 6]
+ mova [t4+r10*1+400* 6], xm0
+ paddd m0, m4, [t3+r10*2+400*12]
+ mova [t3+r10*2+400*12], m4
+ mova xm3, [t4+r10*1+400*2+0]
+ paddw xm3, [t4+r10*1+400*2+4]
+ paddw xm5, xm3, [t4+r10*1+400*2+2]
+ psllw xm5, 2 ; a3[ 1] 444
+ psubw xm4, xm5, xm3 ; a3[ 1] 343
+ paddw xm3, xm4, [t4+r10*1+400* 8]
+ paddw xm3, [t4+r10*1+400*10]
+ mova [t4+r10*1+400* 8], xm4
+ mova [t4+r10*1+400*10], xm5
+ mova m1, [t3+r10*2+400*4+0]
+ paddd m1, [t3+r10*2+400*4+8]
+ paddd m5, m1, [t3+r10*2+400*4+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+400*16]
+ paddd m1, [t3+r10*2+400*20]
+ mova [t3+r10*2+400*16], m4
+ mova [t3+r10*2+400*20], m5
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, xm2 ; a5
+ pmovzxwd m3, xm3 ; a3
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ pslld m4, 13
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ paddd m4, m11
+ paddd m0, m4
+ psrad m0, 7
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1 ; clip
+ psrlw xm0, 6
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova xm3, [t4+r10*1+400*4+0]
+ paddw xm3, [t4+r10*1+400*4+4]
+ paddw xm5, xm3, [t4+r10*1+400*4+2]
+ psllw xm5, 2 ; a3[ 1] 444
+ psubw xm4, xm5, xm3 ; a3[ 1] 343
+ paddw xm3, xm4, [t4+r10*1+400*12]
+ paddw xm3, [t4+r10*1+400*10]
+ mova [t4+r10*1+400*10], xm5
+ mova [t4+r10*1+400*12], xm4
+ mova m1, [t3+r10*2+400*8+0]
+ paddd m1, [t3+r10*2+400*8+8]
+ paddd m5, m1, [t3+r10*2+400*8+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+400*24]
+ paddd m1, [t3+r10*2+400*20]
+ mova [t3+r10*2+400*20], m5
+ mova [t3+r10*2+400*24], m4
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, [t4+r10*1+400* 6]
+ pmovzxwd m3, xm3
+ mova m0, [t3+r10*2+400*12]
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ pslld m4, 13
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 8
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ paddd m4, m11
+ paddd m0, m4
+ psrad m0, 7
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1 ; clip
+ psrlw xm0, 6
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n1_loop
+ add dstq, strideq
+ ret
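+; The mix filter above runs both box sizes in one pass, loosely sketched:
+; sum5/sumsq5 and sum3/sumsq3 are built together in .h/.hv0/.hv1, the 5x5
+; (s0, *25, b5*164) and 3x3 (s1, *9, b3*455) a/b planes are kept separately,
+; and .n0/.n1 interleave the two correction terms (pblendw) so one pmaddwd
+; with m15 = {w0, w1} yields, roughly,
+; src + (w0*(b5 - a5*src) + w1*(b3 - a3*src)) >> shift, clipped to the
+; pixel range.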
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx512.asm b/third_party/dav1d/src/x86/looprestoration16_avx512.asm
new file mode 100644
index 0000000000..e560c54a40
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_avx512.asm
@@ -0,0 +1,2524 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+r_ext_mask: times 72 db -1
+ times 8 db 0
+wiener_hshift: dw 4, 4, 1, 1
+wiener_vshift: dw 1024, 1024, 4096, 4096
+wiener_round: dd 1049600, 1048832
+
+pw_164_455: dw 164, 455
+pw_1023: times 2 dw 1023
+pw_61448: times 2 dw 61448
+pd_m262128: dd -262128
+pd_m34816: dd -34816
+pd_m25: dd -25
+pd_m9: dd -9
+pd_8: dd 8
+pd_2147483648: dd 2147483648
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
+
+INIT_ZMM avx512icl
+cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastd m12, [fltq+ 0] ; x0 x1
+ lea t4, [wiener_hshift]
+ vbroadcasti128 m7, [wiener_shufB]
+ add wd, wd
+ vpbroadcastd m13, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add lpfq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ add dstq, wq
+ vbroadcasti128 m8, [wiener_shufC]
+ lea t1, [rsp+wq+16]
+ vbroadcasti128 m9, [wiener_shufD]
+ neg wq
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ mov r10d, 0xfe
+ vpbroadcastd m10, [base+wiener_round+t3*4]
+ kmovb k1, r10d
+ vpbroadcastd m11, [base+wiener_vshift+t3*4]
+ pmullw m12, m0 ; upshift filter coefs to make the
+ vpbroadcastd m16, [pd_m262128]
+ pmullw m13, m0 ; horizontal downshift constant
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm3, [leftq]
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ mova m4, [lpfq+r10+0]
+ vpbroadcastw xm3, xm4
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ jmp .h_main2
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-8]
+.h_main:
+ mova m4, [lpfq+r10+0]
+.h_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right
+ push r0
+ lea r0, [r_ext_mask+66]
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b
+ vpternlogd m4, m0, [r0+r10+ 8], 0xe4
+ vpternlogd m5, m0, [r0+r10+16], 0xe4
+ pop r0
+.h_have_right:
+ pshufb m2, m3, m6
+ pshufb m1, m4, m7
+ paddw m2, m1
+ pshufb m3, m8
+ mova m0, m16
+ vpdpwssd m0, m2, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ vpdpwssd m0, m3, m13
+ pshufb m2, m5, m7
+ paddw m2, m1
+ mova m1, m16
+ pshufb m4, m8
+ vpdpwssd m1, m2, m12
+ pshufb m5, m9
+ paddw m4, m5
+ vpdpwssd m1, m4, m13
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm3, [leftq]
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m4, [lpfq+r10+0]
+ vpbroadcastw xm3, xm4
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ jmp .hv_main2
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-8]
+.hv_main:
+ mova m4, [lpfq+r10+0]
+.hv_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -68
+ jl .hv_have_right
+ push r0
+ lea r0, [r_ext_mask+66]
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r0+r10+ 0], 0xe4
+ vpternlogd m4, m0, [r0+r10+ 8], 0xe4
+ vpternlogd m5, m0, [r0+r10+16], 0xe4
+ pop r0
+.hv_have_right:
+ pshufb m2, m3, m6
+ pshufb m1, m4, m7
+ paddw m2, m1
+ pshufb m3, m8
+ mova m0, m16
+ vpdpwssd m0, m2, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ vpdpwssd m0, m3, m13
+ pshufb m2, m5, m7
+ paddw m2, m1
+ pshufb m4, m8
+ mova m1, m16
+ vpdpwssd m1, m2, m12
+ pshufb m5, m9
+ paddw m4, m5
+ vpdpwssd m1, m4, m13
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10]
+ mova m5, [t3+r10]
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova m4, [t5+r10]
+ paddw m4, [t1+r10]
+ psraw m0, 1
+ paddw m3, m0, [t6+r10]
+ mova [t0+r10], m0
+ punpcklwd m1, m2, m5
+ mova m0, m10
+ vpdpwssd m0, m1, m15
+ punpckhwd m2, m5
+ mova m1, m10
+ vpdpwssd m1, m2, m15
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10]
+ mova m3, [t3+r10]
+ punpcklwd m1, m2, m3
+ mova m0, m10
+ vpdpwssd m0, m1, m15
+ punpckhwd m2, m3
+ mova m1, m10
+ vpdpwssd m1, m2, m15
+ mova m4, [t1+r10]
+ paddw m3, m4, [t6+r10]
+ paddw m4, [t5+r10]
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
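+; A few notes on the 7-tap AVX512 routine above, loosely sketched: vpdpwssd
+; accumulates the dot products directly on top of the preloaded rounding
+; constants (m16 for the horizontal pass, m10 for the vertical pass), masked
+; loads (vmovdqu64 m3{k1}) merge the pixels from the left edge buffer when
+; LR_HAVE_LEFT is set, and vpternlogd with immediate 0xe4 acts as a
+; "c ? a : b" blend against r_ext_mask for the right-edge extension.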
+
+cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base r13-r_ext_mask-70
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m5, [wiener_shufE]
+ vpbroadcastw m11, [fltq+ 2] ; x1
+ vbroadcasti128 m6, [wiener_shufB]
+ lea r13, [r_ext_mask+70]
+ vbroadcasti128 m7, [wiener_shufD]
+ add wd, wd
+ vpbroadcastd m12, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq
+ vpbroadcastw m13, [fltq+18] ; y1
+ add dstq, wq
+ vpbroadcastd m14, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq+16]
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ neg wq
+ vpbroadcastd m9, [base+wiener_round+t3*4]
+ mov r10d, 0xfffe
+ vpbroadcastd m10, [base+wiener_vshift+t3*4]
+ kmovw k1, r10d
+ pmullw m11, m0
+ pmullw m12, m0
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm3, [leftq+4]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm3, [lpfq+r10]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-4]
+.h_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r13+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m4, m0, [r13+r10+8], 0xe4
+.h_have_right:
+ pshufb m1, m3, m5
+ mova m0, m8
+ vpdpwssd m0, m1, m11
+ pshufb m2, m4, m5
+ mova m1, m8
+ vpdpwssd m1, m2, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ vpdpwssd m0, m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ vpdpwssd m1, m3, m12
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm3, [leftq+4]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastw xm3, [lpfq+r10]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-4]
+.hv_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -66
+ jl .hv_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r13+r10+0], 0xe4
+ vpternlogd m4, m0, [r13+r10+8], 0xe4
+.hv_have_right:
+ pshufb m1, m3, m5
+ mova m0, m8
+ vpdpwssd m0, m1, m11
+ pshufb m2, m4, m5
+ mova m1, m8
+ vpdpwssd m1, m2, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ vpdpwssd m0, m2, m12
+ pshufb m4, m7
+ paddw m4, m3
+ vpdpwssd m1, m4, m12
+ mova m2, [t3+r10]
+ paddw m2, [t1+r10]
+ mova m3, [t2+r10]
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ mova m3, m9
+ vpdpwssd m3, m2, m14
+ mova m2, m9
+ vpdpwssd m2, m4, m14
+ mova m4, [t4+r10]
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t0+r10], m0
+ punpcklwd m1, m0, m4
+ vpdpwssd m2, m1, m13
+ punpckhwd m0, m4
+ vpdpwssd m3, m0, m13
+ psrad m2, 5
+ psrad m3, 5
+ packusdw m2, m3
+ pmulhuw m2, m10
+ mova [dstq+r10], m2
+ add r10, 64
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m0, [t1+r10]
+ paddw m2, m0, [t3+r10]
+ mova m1, [t2+r10]
+ mova m4, [t4+r10]
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ ret
+
+cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r13-r_ext_mask-72
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+8] ; w0
+ add wd, wd
+ vpbroadcastd m8, [base+pd_8]
+ add lpfq, wq
+ vpbroadcastd m9, [base+pd_m25]
+ add dstq, wq
+ vpsubd m10, m6, [paramsq+0] {1to16} ; -s0
+ lea t3, [rsp+wq*2+416*12+8]
+ vpbroadcastd m11, [base+pw_164_455]
+ lea t4, [rsp+wq+416*20+8]
+ vpbroadcastd m12, [base+pw_61448] ; (15 << 12) + (1 << 3)
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15))
+ neg wq
+ vpbroadcastd m14, [base+pw_1023]
+ psllw m7, 4
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10d, 0xfffffff8
+ mova m19, [sgr_x_by_x+64*1]
+ kmovd k1, r10d
+ mova m20, [sgr_x_by_x+64*2]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*3]
+ kmovq k2, r10
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+416*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10- 2]
+.h_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m2, m17, m16, 2
+ paddw m0, m16, m2
+ palignr m3, m17, m16, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ shufpd m17, m16, m17, 0x55
+ paddw m0, m17
+ punpcklwd m3, m16, m17
+ vpdpwssd m1, m3, m3
+ punpckhwd m3, m16, m17
+ vpdpwssd m2, m3, m3
+ shufps m16, m17, q2121
+ paddw m0, m16 ; sum
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10+416*0]
+ paddd m1, [t1+r10+416*2]
+ paddd m2, [t1+r10+416*4]
+.h_loop_end:
+ punpcklwd m17, m16, m6
+ vpdpwssd m1, m17, m17 ; sumsq
+ punpckhwd m16, m6
+ vpdpwssd m2, m16, m16
+ mova [t1+r10+416*0], m0
+ mova [t1+r10+416*2], m1
+ mova [t1+r10+416*4], m2
+ add r10, 64
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-4]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10+416*0]
+ mova m1, [t1+r10+416*2]
+ mova m2, [t1+r10+416*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m1
+ mova [t2+r10+416*4], m2
+ add r10, 64
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m16, [lpfq+r10- 2]
+.hv_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -68
+ jl .hv_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv_have_right:
+ palignr m3, m17, m16, 2
+ paddw m0, m16, m3
+ palignr m1, m17, m16, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ shufpd m17, m16, m17, 0x55
+ paddw m0, m17
+ punpcklwd m1, m16, m17
+ vpdpwssd m2, m1, m1
+ punpckhwd m1, m16, m17
+ vpdpwssd m3, m1, m1
+ shufps m16, m17, q2121
+ paddw m0, m16 ; h sum
+ punpcklwd m17, m16, m6
+ vpdpwssd m2, m17, m17 ; h sumsq
+ punpckhwd m16, m6
+ vpdpwssd m3, m16, m16
+ paddw m1, m0, [t1+r10+416*0]
+ paddd m16, m2, [t1+r10+416*2]
+ paddd m17, m3, [t1+r10+416*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10+416*0] ; hv sum
+ paddd m16, [t2+r10+416*2] ; hv sumsq
+ paddd m17, [t2+r10+416*4]
+ mova [t0+r10+416*0], m0
+ mova [t0+r10+416*2], m2
+ mova [t0+r10+416*4], m3
+ psrlw m3, m1, 1
+ paddd m16, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ pmaxsw m17, m6
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10+4], m16
+ psrld m16, m0, 12 ; b
+ psrld m17, m1, 12
+ mova [t3+r10*2+ 8], xm16
+ mova [t3+r10*2+ 24], xm17
+ vextracti128 [t3+r10*2+ 40], ym16, 1
+ vextracti128 [t3+r10*2+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+104], m16, 3
+ vextracti32x4 [t3+r10*2+120], m17, 3
+ add r10, 64
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10+416*0], m1
+ paddw m1, m0
+ mova [t1+r10+416*2], m16
+ paddd m16, m2
+ mova [t1+r10+416*4], m17
+ paddd m17, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-4]
+.v_loop:
+ mova m2, [t1+r10+416*2]
+ mova m3, [t1+r10+416*4]
+ mova m0, [t1+r10+416*0]
+ paddd m16, m2, [t2+r10+416*2]
+ paddd m17, m3, [t2+r10+416*4]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m16, m2 ; hv sumsq
+ paddd m17, m3
+ paddd m16, m8
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ paddw m0, m0
+ paddw m1, m0 ; hv sum
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ pmaxsw m17, m6
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10+4], m16
+ psrld m16, m0, 12 ; b
+ psrld m17, m1, 12
+ mova [t3+r10*2+ 8], xm16
+ mova [t3+r10*2+ 24], xm17
+ vextracti128 [t3+r10*2+ 40], ym16, 1
+ vextracti128 [t3+r10*2+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+104], m16, 3
+ vextracti32x4 [t3+r10*2+120], m17, 3
+ add r10, 64
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+68]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m16, m1, [t3+r10*2+ 0]
+ paddd m17, m2, [t3+r10*2+64]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m16, [t3+r10*2+ 8]
+ paddd m17, [t3+r10*2+72]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m16
+ pslld m16, 2
+ paddd m2, m17
+ pslld m17, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m16 ; b 565
+ paddd m2, m17
+ mova [t4+r10*1+416*2+ 0], m0
+ mova [t3+r10*2+416*4+ 0], m1
+ mova [t3+r10*2+416*4+64], m2
+ add r10, 64
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+68]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m16, m1, [t3+r10*2+ 0]
+ paddd m17, m2, [t3+r10*2+64]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m16, [t3+r10*2+ 8]
+ paddd m17, [t3+r10*2+72]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m16
+ pslld m16, 2
+ paddd m2, m17
+ pslld m17, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m16 ; b 565
+ paddd m2, m17
+ paddw m3, m0, [t4+r10*1+416*2+ 0]
+ paddd m16, m1, [t3+r10*2+416*4+ 0]
+ paddd m17, m2, [t3+r10*2+416*4+64]
+ mova [t4+r10*1+416*2+ 0], m0
+ mova [t3+r10*2+416*4+ 0], m1
+ mova [t3+r10*2+416*4+64], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m0, [dstq+r10]
+ mova m3, [t4+r10*1+416*2+ 0]
+ mova m16, [t3+r10*2+416*4+ 0]
+ mova m17, [t3+r10*2+416*4+64]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 7)
+ psubd m16, m3
+ psrad m1, 8
+ psrad m16, 8
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
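+; Self-guided restoration, 3x3 (radius-1) box filter pass, 16bpc, AVX-512.
+; Same overall structure as the 5x5 pass above, but with 9-pixel boxes
+; (pd_m9), the s1/w1 parameters, separate even/odd row passes
+; (.hv0/.hv1, .v0/.v1), and 343/444-weighted neighbor sums in
+; .prep_n/.n0/.n1.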
+cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+10] ; w1
+ add wd, wd
+ vpbroadcastd m8, [base+pd_8]
+ add lpfq, wq
+ vpbroadcastd m9, [base+pd_m9]
+ add dstq, wq
+ vpsubd m10, m6, [paramsq+4] {1to16} ; -s1
+ lea t3, [rsp+wq*2+416*12+8]
+ vpbroadcastd m11, [base+pw_164_455]
+ lea t4, [rsp+wq+416*32+8]
+ vpbroadcastd m12, [base+pw_61448]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m13, [base+pd_m34816]
+ neg wq
+ vpbroadcastd m14, [base+pw_1023]
+ psllw m7, 4
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10d, 0xfffffffc
+ mova m19, [sgr_x_by_x+64*1]
+ kmovd k1, r10d
+ mova m20, [sgr_x_by_x+64*2]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*3]
+ kmovq k2, r10
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+416*6]
+.top_fixup_loop:
+ mova m0, [t1+r10+416*0]
+ mova m1, [t1+r10+416*2]
+ mova m2, [t1+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m1
+ mova [t2+r10+416*4], m2
+ add r10, 64
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10+ 0]
+.h_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m0, m17, m16, 2
+ paddw m1, m16, m0
+ punpcklwd m2, m16, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m0
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m1, m17 ; sum
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ mova [t1+r10+416*0], m1
+ mova [t1+r10+416*2], m2
+ mova [t1+r10+416*4], m3
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m16, [lpfq+r10+ 0]
+.hv0_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -66
+ jl .hv0_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv0_have_right:
+ palignr m0, m17, m16, 2
+ paddw m1, m16, m0
+ punpcklwd m2, m16, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m0
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m1, m17 ; sum
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ paddw m0, m1, [t1+r10+416*0]
+ paddd m16, m2, [t1+r10+416*2]
+ paddd m17, m3, [t1+r10+416*4]
+ mova [t1+r10+416*0], m1
+ mova [t1+r10+416*2], m2
+ mova [t1+r10+416*4], m3
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*0+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m16, [lpfq+r10+ 0]
+.hv1_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -66
+ jl .hv1_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv1_have_right:
+ palignr m1, m17, m16, 2
+ paddw m0, m16, m1
+ punpcklwd m2, m16, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m1
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m0, m17 ; h sum
+ punpcklwd m1, m17, m6
+ vpdpwssd m2, m1, m1 ; h sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m16, m2, [t2+r10+416*2]
+ paddd m17, m3, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m2
+ mova [t2+r10+416*4], m3
+ paddd m16, m8
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -((a + 8) >> 4) * 9
+ pmulld m17, m9
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m6, m1 ; b
+ punpckhwd m1, m6, m1
+ pminsd m16, m6
+ pminsd m17, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 455
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10*1+416*2+4], m16
+ psrld m16, m0, 12
+ psrld m17, m1, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m0, [t1+r10+416*0]
+ mova m16, [t1+r10+416*2]
+ mova m17, [t1+r10+416*4]
+ paddw m0, m0
+ paddd m16, m16
+ paddd m17, m17
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*0+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m0, [t1+r10+416*0]
+ mova m16, [t1+r10+416*2]
+ mova m17, [t1+r10+416*4]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*2+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ mova ym16, [t4+r10*1+416*0+0]
+ paddw ym16, [t4+r10*1+416*0+4]
+ paddw ym17, ym16, [t4+r10*1+416*0+2]
+ mova m0, [t3+r10*2+416*0+0]
+ paddd m0, [t3+r10*2+416*0+8]
+ paddd m1, m0, [t3+r10*2+416*0+4]
+ psllw ym17, 2 ; a[-1] 444
+ pslld m1, 2 ; b[-1] 444
+ psubw ym17, ym16 ; a[-1] 343
+ psubd m1, m0 ; b[-1] 343
+ vmovdqa32 [t4+r10*1+416* 4], ym17
+ vmovdqa32 [t3+r10*2+416* 8], m1
+ mova ym16, [t4+r10*1+416*2+0]
+ paddw ym16, [t4+r10*1+416*2+4]
+ paddw ym17, ym16, [t4+r10*1+416*2+2]
+ mova m0, [t3+r10*2+416*4+0]
+ paddd m0, [t3+r10*2+416*4+8]
+ paddd m1, m0, [t3+r10*2+416*4+4]
+ psllw ym17, 2 ; a[ 0] 444
+ pslld m1, 2 ; b[ 0] 444
+ vmovdqa32 [t4+r10*1+416* 6], ym17
+ vmovdqa32 [t3+r10*2+416*12], m1
+ psubw ym17, ym16 ; a[ 0] 343
+ psubd m1, m0 ; b[ 0] 343
+ vmovdqa32 [t4+r10*1+416* 8], ym17
+ vmovdqa32 [t3+r10*2+416*16], m1
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ mova m3, [t4+r10*1+416*0+0]
+ paddw m3, [t4+r10*1+416*0+4]
+ paddw m1, m3, [t4+r10*1+416*0+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+416*4]
+ paddw m3, [t4+r10*1+416*6]
+ mova [t4+r10*1+416*4], m2
+ mova [t4+r10*1+416*6], m1
+ mova m16, [t3+r10*2+416*0+0]
+ paddd m16, [t3+r10*2+416*0+8]
+ paddd m1, m16, [t3+r10*2+416*0+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m16 ; b[ 1] 343
+ paddd m16, m2, [t3+r10*2+416* 8+ 0]
+ paddd m16, [t3+r10*2+416*12+ 0]
+ mova [t3+r10*2+416* 8+ 0], m2
+ mova [t3+r10*2+416*12+ 0], m1
+ mova m17, [t3+r10*2+416*0+64]
+ paddd m17, [t3+r10*2+416*0+72]
+ paddd m1, m17, [t3+r10*2+416*0+68]
+ pslld m1, 2
+ psubd m2, m1, m17
+ paddd m17, m2, [t3+r10*2+416* 8+64]
+ paddd m17, [t3+r10*2+416*12+64]
+ mova [t3+r10*2+416* 8+64], m2
+ mova [t3+r10*2+416*12+64], m1
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m3, [t4+r10*1+416*2+0]
+ paddw m3, [t4+r10*1+416*2+4]
+ paddw m1, m3, [t4+r10*1+416*2+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+416*6]
+ paddw m3, [t4+r10*1+416*8]
+ mova [t4+r10*1+416*6], m1
+ mova [t4+r10*1+416*8], m2
+ mova m16, [t3+r10*2+416*4+0]
+ paddd m16, [t3+r10*2+416*4+8]
+ paddd m1, m16, [t3+r10*2+416*4+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m16 ; b[ 1] 343
+ paddd m16, m2, [t3+r10*2+416*12+ 0]
+ paddd m16, [t3+r10*2+416*16+ 0]
+ mova [t3+r10*2+416*12+ 0], m1
+ mova [t3+r10*2+416*16+ 0], m2
+ mova m17, [t3+r10*2+416*4+64]
+ paddd m17, [t3+r10*2+416*4+72]
+ paddd m1, m17, [t3+r10*2+416*4+68]
+ pslld m1, 2
+ psubd m2, m1, m17
+ paddd m17, m2, [t3+r10*2+416*12+64]
+ paddd m17, [t3+r10*2+416*16+64]
+ mova [t3+r10*2+416*12+64], m1
+ mova [t3+r10*2+416*16+64], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
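+; Self-guided restoration, combined 5x5 + 3x3 pass, 16bpc, AVX-512.
+; Each row computes both box sizes at once (sum3/sumsq3 and sum5/sumsq5),
+; reusing the top_fixup helper from sgr_filter_5x5_16bpc, and .n0/.n1
+; blend the two filter outputs with the packed w0/w1 weights.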
+cglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ vpbroadcastd m7, [paramsq+8] ; w0 w1
+ pxor m6, m6
+ vpbroadcastd m8, [base+pd_8]
+ add wd, wd
+ vpbroadcastd m9, [base+pd_m9]
+ add lpfq, wq
+ vpbroadcastd m10, [base+pd_m25]
+ add dstq, wq
+ vpsubd m11, m6, [paramsq+0] {1to16} ; -s0
+ lea t3, [rsp+wq*2+416*24+8]
+ vpsubd m12, m6, [paramsq+4] {1to16} ; -s1
+ lea t4, [rsp+wq+416*52+8]
+ vpbroadcastd m13, [base+pw_164_455]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m14, [base+pw_61448]
+ neg wq
+ vpbroadcastd m15, [base+pd_m34816]
+ psllw m7, 2
+ vpbroadcastd m22, [base+pd_2147483648]
+ mov r10d, 0xfffffff8
+ mova m18, [sgr_x_by_x+64*0]
+ kmovd k1, r10d
+ mova m19, [sgr_x_by_x+64*1]
+ mov r10, 0x3333333333333333
+ mova m20, [sgr_x_by_x+64*2]
+ kmovq k2, r10
+ mova m21, [sgr_x_by_x+64*3]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup
+ add t1, 416*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+416*12]
+.top_fixup_loop:
+ mova m0, [t1+r10+416* 0]
+ mova m1, [t1+r10+416* 2]
+ mova m2, [t1+r10+416* 4]
+ paddw m0, m0
+ mova m3, [t1+r10+416* 6]
+ paddd m1, m1
+ mova m4, [t1+r10+416* 8]
+ paddd m2, m2
+ mova m5, [t1+r10+416*10]
+ mova [t2+r10+416* 0], m0
+ mova [t2+r10+416* 2], m1
+ mova [t2+r10+416* 4], m2
+ mova [t2+r10+416* 6], m3
+ mova [t2+r10+416* 8], m4
+ mova [t2+r10+416*10], m5
+ add r10, 64
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10- 2]
+.h_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m3, m17, m16, 2
+ palignr m0, m17, m16, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m17, m16, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m4, m0, m6
+ vpdpwssd m2, m4, m4 ; sumsq3
+ punpckhwd m0, m6
+ vpdpwssd m3, m0, m0
+ shufpd m4, m16, m17, 0x55
+ punpcklwd m17, m4, m16
+ paddw m0, m16, m4
+ punpckhwd m4, m16
+ mova [t1+r10+416* 6], m1
+ mova [t1+r10+416* 8], m2
+ mova [t1+r10+416*10], m3
+ paddw m1, m0 ; sum5
+ vpdpwssd m2, m17, m17 ; sumsq5
+ vpdpwssd m3, m4, m4
+ mova [t1+r10+416* 0], m1
+ mova [t1+r10+416* 2], m2
+ mova [t1+r10+416* 4], m3
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m16, [lpfq+r10- 2]
+.hv0_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -68
+ jl .hv0_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv0_have_right:
+ palignr m3, m17, m16, 2
+ palignr m0, m17, m16, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m17, m16, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m4, m0, m6
+ vpdpwssd m2, m4, m4 ; h sumsq3
+ punpckhwd m0, m6
+ vpdpwssd m3, m0, m0
+ shufpd m17, m16, m17, 0x55
+ paddw m4, m1, [t1+r10+416* 6]
+ paddd m5, m2, [t1+r10+416* 8]
+ mova [t1+r10+416* 6], m1
+ mova [t1+r10+416* 8], m2
+ paddw m1, m16
+ paddw m1, m17 ; h sum5
+ punpcklwd m0, m17, m16
+ vpdpwssd m2, m0, m0 ; h sumsq5
+ paddd m0, m3, [t1+r10+416*10]
+ mova [t1+r10+416*10], m3
+ punpckhwd m17, m16
+ vpdpwssd m3, m17, m17
+ mova [t3+r10*2+416*8+ 8], m1 ; we need a clean copy of the last row
+ mova [t3+r10*2+416*0+ 8], m2 ; in case height is odd
+ mova [t3+r10*2+416*0+72], m3
+ paddw m1, [t1+r10+416* 0]
+ paddd m2, [t1+r10+416* 2]
+ paddd m3, [t1+r10+416* 4]
+ mova [t1+r10+416* 0], m1
+ mova [t1+r10+416* 2], m2
+ mova [t1+r10+416* 4], m3
+ paddw m17, m4, [t2+r10+416* 6]
+ paddd m2, m5, [t2+r10+416* 8]
+ paddd m3, m0, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m4
+ mova [t2+r10+416* 8], m5
+ mova [t2+r10+416*10], m0
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m5, m17, 1
+ pavgw m5, m6 ; (b3 + 2) >> 2
+ punpcklwd m4, m5, m6
+ vpdpwssd m2, m4, m4 ; -p3
+ punpckhwd m5, m6
+ vpdpwssd m3, m5, m5
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m12 ; p3 * s1
+ pmulld m3, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m14
+ psraw m3, 4 ; min(z3, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x3
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*2+4], m2
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m16, [lpfq+r10- 2]
+.hv1_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -68
+ jl .hv1_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv1_have_right:
+ palignr m1, m17, m16, 2
+ palignr m3, m17, m16, 4
+ paddw m2, m1, m3
+ punpcklwd m0, m1, m3
+ pmaddwd m0, m0
+ punpckhwd m1, m3
+ pmaddwd m1, m1
+ palignr m3, m17, m16, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m5, m3, m6
+ vpdpwssd m0, m5, m5 ; h sumsq3
+ punpckhwd m3, m6
+ vpdpwssd m1, m3, m3
+ shufpd m3, m16, m17, 0x55
+ punpcklwd m5, m16, m3
+ paddw m4, m16, m3
+ punpckhwd m16, m3
+ paddw m17, m2, [t2+r10+416* 6]
+ mova [t2+r10+416* 6], m2
+ paddw m4, m2 ; h sum5
+ paddd m2, m0, [t2+r10+416* 8]
+ paddd m3, m1, [t2+r10+416*10]
+ mova [t2+r10+416* 8], m0
+ mova [t2+r10+416*10], m1
+ vpdpwssd m0, m5, m5 ; h sumsq5
+ vpdpwssd m1, m16, m16
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m16, m17, 1
+ pavgw m16, m6 ; (b3 + 2) >> 2
+ punpcklwd m5, m16, m6
+ vpdpwssd m2, m5, m5 ; -p3
+ punpckhwd m16, m6
+ vpdpwssd m3, m16, m16
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m12 ; p3 * s1
+ pmulld m3, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m14
+ psraw m3, 4 ; min(z3, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x3
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*4+4], m2
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ paddw m5, m4, [t2+r10+416*0]
+ paddd m2, m0, [t2+r10+416*2]
+ paddd m3, m1, [t2+r10+416*4]
+ paddw m5, [t1+r10+416*0]
+ paddd m2, [t1+r10+416*2]
+ paddd m3, [t1+r10+416*4]
+ mova [t2+r10+416*0], m4
+ mova [t2+r10+416*2], m0
+ mova [t2+r10+416*4], m1
+ mova [t3+r10*2+416*8+ 8], xm16
+ mova [t3+r10*2+416*8+ 24], xm17
+ vextracti128 [t3+r10*2+416*8+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*8+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*8+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*8+120], m17, 3
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m10 ; -((a5 + 8) >> 4) * 25
+ pmulld m3, m10
+ psrlw m17, m5, 1
+ pavgw m17, m6 ; (b5 + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p5
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m5, m6 ; b5
+ punpckhwd m17, m5, m6
+ pmulld m2, m11 ; p5 * s0
+ pmulld m3, m11
+ pmaddwd m16, m13 ; b5 * 164
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ pmaxsw m3, m6
+ paddusw m3, m14
+ psraw m3, 4 ; min(z5, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x5
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*0+4], m2
+ psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m16, [t1+r10+416* 6]
+ mova m2, [t1+r10+416* 8]
+ mova m3, [t1+r10+416*10]
+ paddw m16, m16
+ paddd m2, m2
+ paddd m3, m3
+ paddw m17, m16, [t2+r10+416* 6]
+ paddd m4, m2, [t2+r10+416* 8]
+ paddd m5, m3, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m16
+ mova [t2+r10+416* 8], m2
+ mova [t2+r10+416*10], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a3 + 8) >> 4
+ psrld m5, 4
+ pmulld m4, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m5, m9
+ psrlw m3, m17, 1
+ pavgw m3, m6 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m4, m2, m2 ; -p3
+ punpckhwd m3, m6
+ vpdpwssd m5, m3, m3
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m4, m6
+ pminsd m5, m6
+ pmulld m4, m12 ; p3 * s1
+ pmulld m5, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m5{k2}, m4, m4, 2
+ mova m4, m20
+ paddusw m5, m14
+ psraw m5, 4 ; min(z3, 255) - 256
+ vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m5
+ vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m5{k3}, m4 ; x3
+ pandn m4, m15, m5
+ psrld m5, 16
+ pmulld m16, m4
+ pmulld m17, m5
+ packssdw m4, m5
+ mova [t4+r10*1+416*2+4], m4
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova m3, [t1+r10+416*0]
+ mova m4, [t1+r10+416*2]
+ mova m5, [t1+r10+416*4]
+ mova [t3+r10*2+416*8+ 8], m3
+ mova [t3+r10*2+416*0+ 8], m4
+ mova [t3+r10*2+416*0+72], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+r10+416*0], m3
+ mova [t1+r10+416*2], m4
+ mova [t1+r10+416*4], m5
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m16, [t1+r10+416* 6]
+ mova m2, [t1+r10+416* 8]
+ mova m3, [t1+r10+416*10]
+ paddw m17, m16, [t2+r10+416* 6]
+ paddd m4, m2, [t2+r10+416* 8]
+ paddd m5, m3, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m16
+ mova [t2+r10+416* 8], m2
+ mova [t2+r10+416*10], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a3 + 8) >> 4
+ psrld m5, 4
+ pmulld m4, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m5, m9
+ psrlw m3, m17, 1
+ pavgw m3, m6 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m4, m2, m2 ; -p3
+ punpckhwd m3, m6
+ vpdpwssd m5, m3, m3
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m4, m6
+ pminsd m5, m6
+ pmulld m4, m12 ; p3 * s1
+ pmulld m5, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m5{k2}, m4, m4, 2
+ mova m4, m20
+ paddusw m5, m14
+ psraw m5, 4 ; min(z3, 255) - 256
+ vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m5
+ vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m5{k3}, m4 ; x3
+ pandn m4, m15, m5
+ psrld m5, 16
+ pmulld m16, m4
+ pmulld m17, m5
+ packssdw m4, m5
+ mova [t4+r10*1+416*4+4], m4
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova m0, [t3+r10*2+416*8+ 8]
+ mova m4, [t3+r10*2+416*0+ 8]
+ mova m5, [t3+r10*2+416*0+72]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m4, [t2+r10+416*2]
+ paddd m3, m5, [t2+r10+416*4]
+ paddw m1, [t1+r10+416*0]
+ paddd m2, [t1+r10+416*2]
+ paddd m3, [t1+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m4
+ mova [t2+r10+416*4], m5
+ mova [t3+r10*2+416*8+ 8], xm16
+ mova [t3+r10*2+416*8+ 24], xm17
+ vextracti128 [t3+r10*2+416*8+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*8+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*8+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*8+120], m17, 3
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m10 ; -((a5 + 8) >> 4) * 25
+ pmulld m3, m10
+ psrlw m5, m1, 1
+ pavgw m5, m6 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m6
+ vpdpwssd m2, m4, m4 ; -p5
+ punpckhwd m5, m6
+ vpdpwssd m3, m5, m5
+ punpcklwd m16, m1, m6 ; b5
+ punpckhwd m17, m1, m6
+ pmulld m2, m11 ; p5 * s0
+ pmulld m3, m11
+ pmaddwd m16, m13 ; b5 * 164
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ pmaxsw m3, m6
+ paddusw m3, m14
+ psraw m3, 4 ; min(z5, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x5
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*0+4], m2
+ psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu ym0, [t4+r10*1+416*0+2]
+ paddw ym2, ym0, [t4+r10*1+416*0+0]
+ paddw ym2, [t4+r10*1+416*0+4]
+ movu m1, [t3+r10*2+416*0+4]
+ paddd m3, m1, [t3+r10*2+416*0+0]
+ paddd m3, [t3+r10*2+416*0+8]
+ paddw ym0, ym2
+ paddd m1, m3
+ psllw ym2, 2
+ pslld m3, 2
+ paddw ym0, ym2 ; a5 565
+ paddd m1, m3 ; b5 565
+ mova [t4+r10*1+416* 6], ym0
+ mova [t3+r10*2+416*12], m1
+ mova ym0, [t4+r10*1+416*2+0]
+ paddw ym0, [t4+r10*1+416*2+4]
+ paddw ym2, ym0, [t4+r10*1+416*2+2]
+ mova m1, [t3+r10*2+416*4+0]
+ paddd m1, [t3+r10*2+416*4+8]
+ paddd m3, m1, [t3+r10*2+416*4+4]
+ psllw ym2, 2 ; a3[-1] 444
+ pslld m3, 2 ; b3[-1] 444
+ psubw ym2, ym0 ; a3[-1] 343
+ psubd m3, m1 ; b3[-1] 343
+ mova [t4+r10*1+416* 8], ym2
+ mova [t3+r10*2+416*16], m3
+ mova ym0, [t4+r10*1+416*4+0]
+ paddw ym0, [t4+r10*1+416*4+4]
+ paddw ym2, ym0, [t4+r10*1+416*4+2]
+ mova m1, [t3+r10*2+416*8+0]
+ paddd m1, [t3+r10*2+416*8+8]
+ paddd m3, m1, [t3+r10*2+416*8+4]
+ psllw ym2, 2 ; a3[ 0] 444
+ pslld m3, 2 ; b3[ 0] 444
+ mova [t4+r10*1+416*10], ym2
+ mova [t3+r10*2+416*20], m3
+ psubw ym2, ym0 ; a3[ 0] 343
+ psubd m3, m1 ; b3[ 0] 343
+ mova [t4+r10*1+416*12], ym2
+ mova [t3+r10*2+416*24], m3
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu ym2, [t4+r10*1+2]
+ paddw ym0, ym2, [t4+r10*1+0]
+ paddw ym0, [t4+r10*1+4]
+ paddw ym2, ym0
+ psllw ym0, 2
+ paddw ym0, ym2 ; a5
+ movu m1, [t3+r10*2+4]
+ paddd m4, m1, [t3+r10*2+0]
+ paddd m4, [t3+r10*2+8]
+ paddd m1, m4
+ pslld m4, 2
+ paddd m4, m1 ; b5
+ paddw ym2, ym0, [t4+r10*1+416* 6]
+ mova [t4+r10*1+416* 6], ym0
+ paddd m0, m4, [t3+r10*2+416*12]
+ mova [t3+r10*2+416*12], m4
+ mova ym3, [t4+r10*1+416*2+0]
+ paddw ym3, [t4+r10*1+416*2+4]
+ paddw ym5, ym3, [t4+r10*1+416*2+2]
+ psllw ym5, 2 ; a3[ 1] 444
+ psubw ym4, ym5, ym3 ; a3[ 1] 343
+ paddw ym3, ym4, [t4+r10*1+416* 8]
+ paddw ym3, [t4+r10*1+416*10]
+ mova [t4+r10*1+416* 8], ym4
+ mova [t4+r10*1+416*10], ym5
+ mova m1, [t3+r10*2+416*4+0]
+ paddd m1, [t3+r10*2+416*4+8]
+ paddd m5, m1, [t3+r10*2+416*4+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+416*16]
+ paddd m1, [t3+r10*2+416*20]
+ mova [t3+r10*2+416*16], m4
+ mova [t3+r10*2+416*20], m5
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, ym2 ; a5
+ pmovzxwd m3, ym3 ; a3
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ vpshldd m4, m22, 13
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ vpblendmb m0{k2}, m1, m0
+ vpdpwssd m4, m0, m7
+ psrad m4, 7
+ pmaxsd m4, m6
+ vpmovusdw ym16, m4 ; clip
+ psrlw ym16, 6
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova ym3, [t4+r10*1+416*4+0]
+ paddw ym3, [t4+r10*1+416*4+4]
+ paddw ym5, ym3, [t4+r10*1+416*4+2]
+ psllw ym5, 2 ; a3[ 1] 444
+ psubw ym4, ym5, ym3 ; a3[ 1] 343
+ paddw ym3, ym4, [t4+r10*1+416*12]
+ paddw ym3, [t4+r10*1+416*10]
+ mova [t4+r10*1+416*10], ym5
+ mova [t4+r10*1+416*12], ym4
+ mova m0, [t3+r10*2+416*8+0]
+ paddd m0, [t3+r10*2+416*8+8]
+ paddd m5, m0, [t3+r10*2+416*8+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m0 ; b3[ 1] 343
+ paddd m0, m4, [t3+r10*2+416*24]
+ paddd m0, [t3+r10*2+416*20]
+ mova [t3+r10*2+416*20], m5
+ mova [t3+r10*2+416*24], m4
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, [t4+r10*1+416* 6]
+ pmovzxwd m3, ym3
+ mova m1, [t3+r10*2+416*12]
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ vpshldd m4, m22, 13
+ psubd m1, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m0, m3 ; b3 - a3 * src + (1 << 8)
+ pslld m0, 7
+ vpalignr m0{k2}, m1, m1, 1
+ vpdpwssd m4, m0, m7
+ psrad m4, 7
+ pmaxsd m4, m6
+ vpmovusdw ym16, m4 ; clip
+ psrlw ym16, 6
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration16_sse.asm b/third_party/dav1d/src/x86/looprestoration16_sse.asm
new file mode 100644
index 0000000000..872e502982
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_sse.asm
@@ -0,0 +1,3723 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+pb_m14_m13: times 8 db -14,-13
+pb_m10_m9: times 8 db -10, -9
+pb_m6_m5: times 8 db -6, -5
+pb_m2_m1: times 8 db -2, -1
+pb_2_3: times 8 db 2, 3
+pb_6_7: times 8 db 6, 7
+pw_256: times 8 dw 256
+pw_1023: times 8 dw 1023
+pd_8: times 4 dd 8
+pd_4096: times 4 dd 4096
+pd_34816: times 4 dd 34816
+pd_m262128: times 4 dd -262128
+pd_0xffff: times 4 dd 0xffff
+pd_0xf00800a4: times 4 dd 0xf00800a4
+pd_0xf00801c7: times 4 dd 0xf00801c7
+pd_0xfffffff0: times 4 dd 0xfffffff0
+
+wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
+wiener_round: dd 1049600, 1048832
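+; wiener_shifts/wiener_round are indexed by pixel_max >> 11
+; (0 for 10-bit content, 1 for 12-bit content)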
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+%macro movif64 2 ; dst, src
+ %if ARCH_X86_64
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movif32 2 ; dst, src
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
+
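+; 7-tap Wiener filter, 16bpc, SSSE3. On x86-32 only xmm0-xmm7 are
+; available, so m8-m15 and the t0-t6 ring buffer pointers are spilled to
+; the stack (t0m-t6m); the movif32/movif64 macros above emit their moves
+; only for the matching architecture.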
+INIT_XMM ssse3
+%if ARCH_X86_32
+DECLARE_REG_TMP 5, 6
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 13*16
+ %else
+ %assign extra_stack 12*16
+ %endif
+cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \
+ dst, stride, left, lpf, w, flt
+ %if STACK_ALIGNMENT < 16
+ %define lpfm dword [esp+calloff+16*12+ 0]
+ %define wm dword [esp+calloff+16*12+ 4]
+ %define hd dword [esp+calloff+16*12+ 8]
+ %define edgeb byte [esp+calloff+16*12+12]
+ %define edged dword [esp+calloff+16*12+12]
+ %else
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %endif
+ %define PICmem dword [esp+calloff+4*0]
+ %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers
+ %define t1m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define t5m dword [esp+calloff+4*6]
+ %define t6m dword [esp+calloff+4*7]
+ %define t2 t2m
+ %define t3 t3m
+ %define t4 t4m
+ %define t5 t5m
+ %define t6 t6m
+ %define m8 [esp+calloff+16*2]
+ %define m9 [esp+calloff+16*3]
+ %define m10 [esp+calloff+16*4]
+ %define m11 [esp+calloff+16*5]
+ %define m12 [esp+calloff+16*6]
+ %define m13 [esp+calloff+16*7]
+ %define m14 [esp+calloff+16*8]
+ %define m15 [esp+calloff+16*9]
+ %define r10 r4
+ %define base t0-wiener_shifts
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov wd, [rstk+stack_offset+20]
+ mov wm, wd
+ mov r5, [rstk+stack_offset+24]
+ mov hd, r5
+ mov r5, [rstk+stack_offset+32]
+ mov edged, r5 ; edge
+ %endif
+%else
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
+cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ %define base
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ movq m13, [fltq]
+ movq m15, [fltq+16]
+%else
+ %if STACK_ALIGNMENT < 16
+ mov t0, [rstk+stack_offset+28]
+ mov t1, [rstk+stack_offset+36] ; pixel_max
+ movq m1, [t0] ; fx
+ movq m3, [t0+16] ; fy
+ LEA t0, wiener_shifts
+ %else
+ mov fltq, r6m
+ movq m1, [fltq]
+ movq m3, [fltq+16]
+ LEA t0, wiener_shifts
+ mov t1, r8m ; pixel_max
+ %endif
+ mov PICmem, t0
+%endif
+ mova m6, [base+wiener_shufA]
+ mova m7, [base+wiener_shufB]
+%if ARCH_X86_64
+ lea t4, [wiener_shifts]
+ add wd, wd
+ pshufd m12, m13, q0000 ; x0 x1
+ pshufd m13, m13, q1111 ; x2 x3
+ pshufd m14, m15, q0000 ; y0 y1
+ pshufd m15, m15, q1111 ; y2 y3
+ mova m8, [wiener_shufC]
+ mova m9, [wiener_shufD]
+ add lpfq, wq
+ lea t1, [rsp+wq+16]
+ add dstq, wq
+ neg wq
+ shr t3d, 11
+ %define base t4-wiener_shifts
+ movd m10, [base+wiener_round+t3*4]
+ movq m11, [base+wiener_shifts+t3*8]
+ pshufd m10, m10, q0000
+ pshufd m0, m11, q0000
+ pshufd m11, m11, q1111
+ pmullw m12, m0 ; upshift filter coefs to make the
+ pmullw m13, m0 ; horizontal downshift constant
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+ %define base
+ %define wiener_lshuf7_mem [wiener_lshuf7]
+ %define pd_m262128_mem [pd_m262128]
+%else
+ add wd, wd
+ mova m4, [base+wiener_shufC]
+ mova m5, [base+wiener_shufD]
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ pshufd m2, m3, q0000
+ pshufd m3, m3, q1111
+ mova m8, m4
+ mova m9, m5
+ mova m14, m2
+ mova m15, m3
+ shr t1, 11
+ add lpfq, wq
+ mova m3, [base+pd_m262128]
+ movd m4, [base+wiener_round+t1*4]
+ movq m5, [base+wiener_shifts+t1*8]
+ lea t1, [esp+extra_stack+wq+16]
+ add dstq, wq
+ neg wq
+ pshufd m4, m4, q0000
+ pshufd m2, m5, q0000
+ pshufd m5, m5, q1111
+ mov wm, wq
+ pmullw m0, m2
+ pmullw m1, m2
+ mova m2, [base+wiener_lshuf7]
+ %define pd_m262128_mem [esp+calloff+16*10]
+ mova pd_m262128_mem, m3
+ mova m10, m4
+ mova m11, m5
+ mova m12, m0
+ mova m13, m1
+ %define wiener_lshuf7_mem [esp+calloff+16*11]
+ mova wiener_lshuf7_mem, m2
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov lpfm, r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, lpfm
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+ movif32 wq, wm
+.v2:
+ call .v
+ movif32 wq, wm
+ jmp .v1
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movif32 t0, PICmem
+ pxor m0, m0
+ movd m1, wd
+ mova m2, [base+pb_0to15]
+ pshufb m1, m0
+ mova m0, [base+pb_6_7]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m3, m0
+ mova m0, [base+pb_m2_m1]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m4, m0
+ mova m0, [base+pb_m10_m9]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m5, m0
+ movif32 t0, t0m
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h:
+ movif64 wq, r4
+ movif32 wq, wm
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq m3, [leftq]
+ movhps m3, [lpfq+wq]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ mova m3, [lpfq+wq] ; avoid accessing memory located
+ pshufb m3, wiener_lshuf7_mem ; before the start of the buffer
+ jmp .h_main
+.h_top:
+ movif64 wq, r4
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+wq-8]
+.h_main:
+ mova m4, [lpfq+wq+0]
+ movu m5, [lpfq+wq+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -20
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18)
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+wq], m0
+ add wq, 16
+ jl .h_loop
+ movif32 wq, wm
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq m3, [leftq]
+ movhps m3, [lpfq+wq]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m3, [lpfq+wq]
+ pshufb m3, wiener_lshuf7_mem
+ jmp .hv_main
+.hv_bottom:
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+wq-8]
+.hv_main:
+ mova m4, [lpfq+wq+0]
+ movu m5, [lpfq+wq+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -20
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t1, t4m
+ movif32 t0, t2m
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ mova m2, pd_m262128_mem
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+%if ARCH_X86_64
+ mova m2, [t4+wq]
+ paddw m2, [t2+wq]
+ mova m5, [t3+wq]
+%else
+ mova m2, [t1+wq]
+ paddw m2, [t0+wq]
+ mov t1, t3m
+ mov t0, t5m
+ mova m5, [t1+wq]
+ mov t1, t1m
+%endif
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+%if ARCH_X86_64
+ mova m4, [t5+wq]
+ paddw m4, [t1+wq]
+ psraw m0, 1
+ paddw m3, m0, [t6+wq]
+%else
+ mova m4, [t0+wq]
+ paddw m4, [t1+wq]
+ mov t0, t0m
+ mov t1, t6m
+ psraw m0, 1
+ paddw m3, m0, [t1+wq]
+%endif
+ mova [t0+wq], m0
+ punpcklwd m0, m2, m5
+ pmaddwd m0, m15
+ punpckhwd m2, m5
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m4
+ pmaddwd m1, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m2, m10
+ paddd m0, m1
+ paddd m2, m3
+ psrad m0, 6
+ psrad m2, 6
+ packssdw m0, m2
+ pmulhw m0, m11
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .hv_loop
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+%else
+ mov r4, t5m
+ mov t1, t4m
+ mov t6m, r4
+ mov t5m, t1
+ mov r4, t3m
+ mov t1, t2m
+ mov t4m, r4
+ mov t3m, t1
+ mov r4, t1m
+ mov t1, t0
+ mov t2m, r4
+ mov t0, t6m
+ mov wq, wm
+%endif
+ add dstq, strideq
+ ret
+.v:
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+.v_loop:
+%if ARCH_X86_64
+ mova m1, [t4+wq]
+ paddw m1, [t2+wq]
+ mova m2, [t3+wq]
+ mova m4, [t1+wq]
+ paddw m3, m4, [t6+wq]
+ paddw m4, [t5+wq]
+%else
+ mov t0, t4m
+ mov t1, t2m
+ mova m1, [t0+wq]
+ paddw m1, [t1+wq]
+ mov t0, t3m
+ mov t1, t1m
+ mova m2, [t0+wq]
+ mova m4, [t1+wq]
+ mov t0, t6m
+ mov t1, t5m
+ paddw m3, m4, [t0+wq]
+ paddw m4, [t1+wq]
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m15
+ punpckhwd m1, m2
+ pmaddwd m1, m15
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m1, m10
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 6
+ psrad m1, 6
+ packssdw m0, m1
+ pmulhw m0, m11
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .v_loop
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+%else
+ mov t0, t5m
+ mov t1, t4m
+ mov r4, t3m
+ mov t6m, t0
+ mov t5m, t1
+ mov t4m, r4
+ mov r4, t2m
+ mov t1, t1m
+ mov t0, t0m
+ mov t3m, r4
+ mov t2m, t1
+%endif
+ add dstq, strideq
+ ret
+
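+; wiener_filter5 keeps the horizontally filtered rows in a small ring
+; buffer of 384*2-byte rows: t1-t4 hold the previous rows, t0 is the slot
+; being written, and .hv rotates the pointers after each row so the 5-tap
+; vertical pass always sees the five most recent rows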
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign stack_size 12*16+384*8
+ %else
+ %assign stack_size 11*16+384*8
+ %endif
+cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \
+ lpf, w, flt
+ %if STACK_ALIGNMENT < 16
+ %define lpfm dword [esp+calloff+4*6]
+ %define wm dword [esp+calloff+4*7]
+ %define hd dword [esp+calloff+16*10+0]
+ %define edgeb byte [esp+calloff+16*10+4]
+ %define edged dword [esp+calloff+16*10+4]
+ %else
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %endif
+ %define PICmem dword [esp+calloff+4*0]
+ %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers
+ %define t1m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define t2 t2m
+ %define t3 t3m
+ %define t4 t4m
+ %define m8 [esp+calloff+16*2]
+ %define m9 [esp+calloff+16*3]
+ %define m10 [esp+calloff+16*4]
+ %define m11 [esp+calloff+16*5]
+ %define m12 [esp+calloff+16*6]
+ %define m13 [esp+calloff+16*7]
+ %define m14 [esp+calloff+16*8]
+ %define m15 [esp+calloff+16*9]
+ %define base t0-wiener_shifts
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov wd, [rstk+stack_offset+20]
+ mov wm, wd
+ mov r5, [rstk+stack_offset+24]
+ mov hd, r5
+ mov r5, [rstk+stack_offset+32]
+ mov edged, r5 ; edge
+ %endif
+%else
+cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ %define base
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ movq m12, [fltq]
+ movq m14, [fltq+16]
+%else
+ %if STACK_ALIGNMENT < 16
+ mov t0, [rstk+stack_offset+28]
+ mov t1, [rstk+stack_offset+36] ; pixel_max
+ movq m1, [t0] ; fx
+ movq m3, [t0+16] ; fy
+ LEA t0, wiener_shifts
+ %else
+ mov fltq, r6m
+ movq m1, [fltq]
+ movq m3, [fltq+16]
+ LEA t0, wiener_shifts
+ mov t1, r8m ; pixel_max
+ %endif
+ mov PICmem, t0
+%endif
+ mova m5, [base+wiener_shufE]
+ mova m6, [base+wiener_shufB]
+ mova m7, [base+wiener_shufD]
+%if ARCH_X86_64
+ lea t4, [wiener_shifts]
+ add wd, wd
+ punpcklwd m11, m12, m12
+ pshufd m11, m11, q1111 ; x1
+ pshufd m12, m12, q1111 ; x2 x3
+ punpcklwd m13, m14, m14
+ pshufd m13, m13, q1111 ; y1
+ pshufd m14, m14, q1111 ; y2 y3
+ shr t3d, 11
+ mova m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq
+ lea t1, [rsp+wq+16]
+ add dstq, wq
+ neg wq
+ %define base t4-wiener_shifts
+ movd m9, [base+wiener_round+t3*4]
+ movq m10, [base+wiener_shifts+t3*8]
+ pshufd m9, m9, q0000
+ pshufd m0, m10, q0000
+ pshufd m10, m10, q1111
+ mova m15, [wiener_lshuf5]
+ pmullw m11, m0
+ pmullw m12, m0
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+ %define base
+%else
+ add wd, wd
+ punpcklwd m0, m1, m1
+ pshufd m0, m0, q1111 ; x1
+ pshufd m1, m1, q1111 ; x2 x3
+ punpcklwd m2, m3, m3
+ pshufd m2, m2, q1111 ; y1
+ pshufd m3, m3, q1111 ; y2 y3
+ mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
+ mova m13, m2
+ mova m14, m3
+ mova m8, m4
+ shr t1, 11
+ add lpfq, wq
+ movd m2, [base+wiener_round+t1*4]
+ movq m3, [base+wiener_shifts+t1*8]
+ %if STACK_ALIGNMENT < 16
+ lea t1, [esp+16*11+wq+16]
+ %else
+ lea t1, [esp+16*10+wq+16]
+ %endif
+ add dstq, wq
+ neg wq
+ pshufd m2, m2, q0000
+ pshufd m4, m3, q0000
+ pshufd m3, m3, q1111
+ mov wm, wq
+ pmullw m0, m4
+ pmullw m1, m4
+ mova m4, [base+wiener_lshuf5]
+ mova m9, m2
+ mova m10, m3
+ mova m11, m0
+ mova m12, m1
+ mova m15, m4
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov lpfm, r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, lpfm
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+%if ARCH_X86_64
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+%else
+ mov t0, t3m
+ mov r4, t2m
+ mov t1, t1m
+ mov t4m, t0
+ mov t3m, r4
+ mov t2m, t1
+ mov wq, wm
+%endif
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movif32 t0, PICmem
+ pxor m1, m1
+ movd m2, wd
+ mova m0, [base+pb_2_3]
+ pshufb m2, m1
+ mova m1, [base+pb_m6_m5]
+ psubb m0, m2
+ psubb m1, m2
+ mova m2, [base+pb_0to15]
+ pminub m0, m2
+ pminub m1, m2
+ pshufb m3, m0
+ pshufb m4, m1
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h:
+ movif64 wq, r4
+ movif32 wq, wm
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ mova m4, [lpfq+wq]
+ movd m3, [leftq+4]
+ pslldq m4, 4
+ por m3, m4
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ mova m3, [lpfq+wq] ; avoid accessing memory located
+ pshufb m3, m15 ; before the start of the buffer
+ jmp .h_main
+.h_top:
+ movif64 wq, r4
+ movif32 wq, wm
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+wq-4]
+.h_main:
+ movu m4, [lpfq+wq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+wq], m0
+ add wq, 16
+ jl .h_loop
+ movif32 wq, wm
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ mova m4, [lpfq+wq]
+ movd m3, [leftq+4]
+ pslldq m4, 4
+ por m3, m4
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m3, [lpfq+wq]
+ pshufb m3, m15
+ jmp .hv_main
+.hv_bottom:
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+wq-4]
+.hv_main:
+ movu m4, [lpfq+wq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t1, t1m
+ movif32 t0, t3m
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+%if ARCH_X86_64
+ mova m2, [t3+wq]
+ paddw m2, [t1+wq]
+ paddd m1, m3
+ mova m4, [t2+wq]
+%else
+ mova m2, [t0+wq]
+ mov t0, t2m
+ paddw m2, [t1+wq]
+ mov t1, t4m
+ paddd m1, m3
+ mova m4, [t0+wq]
+ mov t0, t0m
+%endif
+ punpckhwd m3, m2, m4
+ pmaddwd m3, m14
+ punpcklwd m2, m4
+%if ARCH_X86_64
+ mova m4, [t4+wq]
+%else
+ mova m4, [t1+wq]
+%endif
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ pmaddwd m2, m14
+ psraw m0, 1
+ mova [t0+wq], m0
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 6
+ psrad m0, 6
+ packssdw m0, m1
+ pmulhw m0, m10
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .hv_loop
+%if ARCH_X86_64
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+%else
+ mov r4, t3m
+ mov t1, t2m
+ mov t4m, r4
+ mov t3m, t1
+ mov r4, t1m
+ mov t1, t0
+ mov t2m, r4
+ mov t0, t4m
+ mov wq, wm
+%endif
+ add dstq, strideq
+ ret
+.v:
+ movif64 wq, r4
+ movif32 t1m, t1
+.v_loop:
+%if ARCH_X86_64
+ mova m0, [t1+wq]
+ paddw m2, m0, [t3+wq]
+ mova m1, [t2+wq]
+ mova m4, [t4+wq]
+%else
+ mov t0, t3m
+ mova m0, [t1+wq]
+ mov t1, t2m
+ paddw m2, m0, [t0+wq]
+ mov t0, t4m
+ mova m1, [t1+wq]
+ mova m4, [t0+wq]
+%endif
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 6
+ psrad m0, 6
+ packssdw m0, m1
+ pmulhw m0, m10
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+%if ARCH_X86_64
+ jl .v_loop
+%else
+ jge .v_end
+ mov t1, t1m
+ jmp .v_loop
+.v_end:
+%endif
+ ret
+
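+; there is no SSE gather, so GATHERDD pulls sgr_x_by_x entries one lane at
+; a time with movd/pextrw/pinsrw (r13 or base holds sgr_x_by_x-0xf03);
+; GATHER_X_BY_X runs two such gathers, keeps the looked-up byte of each
+; dword (psrld 24) and packs both results into words, in effect computing
+; dst[i] = sgr_x_by_x[min(z[i], 255)] per lane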
+%macro GATHERDD 3 ; dst, src, tmp
+ movd %3d, %2
+ %if ARCH_X86_64
+ movd %1, [r13+%3]
+ pextrw %3d, %2, 2
+ pinsrw %1, [r13+%3+2], 3
+ pextrw %3d, %2, 4
+ pinsrw %1, [r13+%3+2], 5
+ pextrw %3d, %2, 6
+ pinsrw %1, [r13+%3+2], 7
+ %else
+ movd %1, [base+sgr_x_by_x-0xf03+%3]
+ pextrw %3, %2, 2
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
+ pextrw %3, %2, 4
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
+ pextrw %3, %2, 6
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
+ %endif
+%endmacro
+
+%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
+ %if ARCH_X86_64
+ %define tmp r14
+ %else
+ %define tmp %4
+ %endif
+ GATHERDD %1, %2, tmp
+ GATHERDD %2, %3, tmp
+ movif32 %4, %5
+ psrld %1, 24
+ psrld %2, 24
+ packssdw %1, %2
+%endmacro
+
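+; MAXSD is a per-dword signed maximum (dst = max(dst, src)) built from
+; pcmpgtd/pand/pandn/por, since pmaxsd requires SSE4.1; the optional fourth
+; argument re-zeroes the tmp register afterwards so it can keep serving as
+; the all-zero register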
+%macro MAXSD 3-4 0 ; dst, src, tmp[, restore_tmp]
+ pcmpgtd %3, %1, %2
+ pand %1, %3
+ pandn %3, %2
+ por %1, %3
+ %if %4 == 1
+ pxor %3, %3
+ %endif
+%endmacro
+
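+; MULLD is a packed 32-bit multiply by a 16-bit factor (the factor is
+; expected to be broadcast into both word halves of each dword of src),
+; assembled from pmullw/pmulhuw/pslld/paddd since pmulld requires SSE4.1;
+; per dword: dst = (dst * factor) mod 2^32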
+%macro MULLD 3 ; dst, src, tmp
+ pmulhuw %3, %1, %2
+ pmullw %1, %2
+ pslld %3, 16
+ paddd %1, %3
+%endmacro
+
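+; rough outline of sgr_filter_5x5: .h/.hv accumulate the 5x5 box sum (b)
+; and sum of squares (a) per row pair, derive p = a*25 - b*b (with the
+; prescaling shown in the inline comments), clamp z = p*s to 255, look up
+; x = sgr_x_by_x[z], and store x (t4) and (x*b*164 + rounding) >> 12 (t3)
+; for the neighbour passes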
+%if ARCH_X86_32
+DECLARE_REG_TMP 0, 1, 2, 3, 5
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 5*16
+ %else
+ %assign extra_stack 3*16
+ %endif
+cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*0+4*6]
+ %define stridemp dword [esp+calloff+16*0+4*7]
+ %define leftm dword [esp+calloff+16*3+4*0]
+ %define lpfm dword [esp+calloff+16*3+4*1]
+ %define w0m dword [esp+calloff+16*3+4*2]
+ %define hd dword [esp+calloff+16*3+4*3]
+ %define edgeb byte [esp+calloff+16*3+4*4]
+ %define edged dword [esp+calloff+16*3+4*4]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t0m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define m8 [base+pd_8]
+ %define m9 [base+pd_0xfffffff0]
+ %define m10 [esp+calloff+16*2]
+ %define m11 [base+pd_0xf00800a4]
+ %define m12 [base+sgr_lshuf5]
+ %define m13 [base+pd_34816]
+ %define m14 [base+pw_1023]
+ %define r10 r4
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ add wd, wd
+ mov edged, r7m
+ movu m10, [paramsq]
+ mova m12, [sgr_lshuf5]
+ add lpfq, wq
+ mova m8, [pd_8]
+ lea t1, [rsp+wq+20]
+ mova m9, [pd_0xfffffff0]
+ add dstq, wq
+ lea t3, [rsp+wq*2+400*12+16]
+ mova m11, [pd_0xf00800a4]
+ lea t4, [rsp+wq+400*20+16]
+ pshufhw m7, m10, q0000
+ pshufb m10, [pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ neg wq
+ mova m13, [pd_34816] ; (1 << 11) + (1 << 15)
+ pxor m6, m6
+ mova m14, [pw_1023]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ add wd, wd
+ movu m1, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*2+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq+400*20+16]
+ mov t3m, t3
+ pshufhw m7, m1, q0000
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ psllw m7, 4
+ neg wq
+ mova m10, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 4
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ movif32 t2m, t1
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t0m, t2
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, stridemp
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ call .h
+ add lpfq, stridemp
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+ sub hd, 2
+ movif32 t0, t0m
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .h_top
+ add lpfq, stridemp
+ call .hv_bottom
+.end:
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ movif32 dstq, dstm
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ lea t2, [t1+400*6]
+ movif32 t2m, t2
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ movif32 t0m, t0
+ jmp .main
+.no_top_height1:
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v
+ call .prep_n
+ jmp .odd_height_end
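+; .extend_right replicates the last valid pixel ([lpfq-2]) into the lanes
+; past the right edge: a per-byte compare mask is derived from the
+; remaining (negative) width and used to blend it into m4/m5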
+.extend_right:
+ movd m0, wd
+ movd m1, [lpfq-2]
+ mova m2, [base+pw_256]
+ mova m3, [base+pb_m14_m13]
+ pshufb m0, m6
+ pshufb m1, m2
+ psubb m2, m0
+ psubb m3, m0
+ mova m0, [base+pb_0to15]
+ pcmpgtb m2, m0
+ pcmpgtb m3, m0
+ pand m4, m2
+ pand m5, m3
+ pandn m2, m1
+ pandn m3, m1
+ por m4, m2
+ por m5, m3
+ ret
+%assign stack_offset stack_offset+4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m4, [lpfq+wq- 2]
+.h_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -20
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+wq+400*0]
+ paddd m1, [t1+wq+400*2]
+ paddd m2, [t1+wq+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+wq+400*0], m0
+ mova [t1+wq+400*2], m1
+ mova [t1+wq+400*4], m2
+ add wq, 16
+ jl .h_loop
+ ret
+.top_fixup:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+wq+400*0]
+ mova m1, [t1+wq+400*2]
+ mova m2, [t1+wq+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m1
+ mova [t2+wq+400*4], m2
+ add wq, 16
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .hv_main
+.hv_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .hv_main
+.hv_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv_loop_start
+%endif
+.hv_loop:
+ movif32 lpfq, hvsrcm
+.hv_loop_start:
+ movu m4, [lpfq+wq- 2]
+.hv_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -20
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t3, hd
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+wq+400*0]
+ paddd m4, m2, [t1+wq+400*2]
+ paddd m5, m3, [t1+wq+400*4]
+%if ARCH_X86_64
+ test hd, hd
+%else
+ test t3, t3
+%endif
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+wq+400*0] ; hv sum
+ paddd m4, [t2+wq+400*2] ; hv sumsq
+ paddd m5, [t2+wq+400*4]
+ mova [t0+wq+400*0], m0
+ mova [t0+wq+400*2], m2
+ mova [t0+wq+400*4], m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ pand m4, m9 ; ((a + 8) >> 4) << 4
+ pand m5, m9
+ psrld m2, m4, 4
+ psrld m0, m5, 4
+ paddd m2, m4
+ psrld m4, 1
+ paddd m0, m5
+ psrld m5, 1
+ paddd m4, m2 ; a * 25
+ paddd m5, m0
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m6
+ MAXSD m5, m3, m6, 1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m10, m2 ; p * s
+ MULLD m5, m10, m2
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+wq+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*2+ 8], m0
+ mova [t3+wq*2+24], m1
+ add wq, 16
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ movif32 t2m, t2
+ movif32 t0m, t0
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+wq+400*0], m1
+ paddw m1, m0
+ mova [t1+wq+400*2], m4
+ paddd m4, m2
+ mova [t1+wq+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v_loop:
+ mova m0, [t1+wq+400*0]
+ mova m2, [t1+wq+400*2]
+ mova m3, [t1+wq+400*4]
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m4, m2, [t2+wq+400*2]
+ paddd m5, m3, [t2+wq+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ paddd m5, m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ pand m4, m9 ; ((a + 8) >> 4) << 4
+ pand m5, m9
+ psrld m2, m4, 4
+ psrld m0, m5, 4
+ paddd m2, m4
+ psrld m4, 1
+ paddd m0, m5
+ psrld m5, 1
+ paddd m4, m2 ; a * 25
+ paddd m5, m0
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m6
+ MAXSD m5, m3, m6, 1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m10, m2 ; p * s
+ MULLD m5, m10, m2
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+wq+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*2+ 8], m0
+ mova [t3+wq*2+24], m1
+ add wq, 16
+ jl .v_loop
+ ret
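+; neighbour passes: .prep_n seeds the 5/6/5-weighted horizontal sums of a
+; and b ("a 565"/"b 565", i.e. 5*v[x-1] + 6*v[x] + 5*v[x+1]) for the first
+; row pair; .n0 adds the previous and current weighted sums for even output
+; rows (hence the >> 9), while .n1 reuses the stored sum alone for odd rows
+; (hence the >> 8)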
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*1+ 2]
+ movu m3, [t4+wq*1+ 4]
+ movu m1, [t3+wq*2+ 4]
+ movu m4, [t3+wq*2+ 8]
+ movu m2, [t3+wq*2+20]
+ movu m5, [t3+wq*2+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*1+ 0]
+ paddd m4, [t3+wq*2+ 0]
+ paddd m5, [t3+wq*2+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+wq*1+400*2+ 0], m0
+ mova [t3+wq*2+400*4+ 0], m1
+ mova [t3+wq*2+400*4+16], m2
+ add wq, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*1+ 2]
+ movu m3, [t4+wq*1+ 4]
+ movu m1, [t3+wq*2+ 4]
+ movu m4, [t3+wq*2+ 8]
+ movu m2, [t3+wq*2+20]
+ movu m5, [t3+wq*2+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*1+ 0]
+ paddd m4, [t3+wq*2+ 0]
+ paddd m5, [t3+wq*2+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+wq*1+400*2+ 0]
+ paddd m4, m1, [t3+wq*2+400*4+ 0]
+ paddd m5, m2, [t3+wq*2+400*4+16]
+ mova [t4+wq*1+400*2+ 0], m0
+ mova [t3+wq*2+400*4+ 0], m1
+ mova [t3+wq*2+400*4+16], m2
+ mova m0, [dstq+wq]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ mova m0, [dstq+wq]
+ mova m3, [t4+wq*1+400*2+ 0]
+ mova m4, [t3+wq*2+400*4+ 0]
+ mova m5, [t3+wq*2+400*4+16]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 7)
+ psubd m5, m3
+ psrad m4, 8
+ psrad m5, 8
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
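+; sgr_filter_3x3 follows the same outline as the 5x5 filter but works on
+; 3x3 box sums with alternating even/odd row passes (.hv0/.hv1): here
+; p = a*9 - b*b, the scaling constant is 455 instead of 164, and only the
+; s1 strength is used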
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 4*16
+ %else
+ %assign extra_stack 2*16
+ %endif
+cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*2+4*0]
+ %define stridemp dword [esp+calloff+16*2+4*1]
+ %define leftm dword [esp+calloff+16*2+4*2]
+ %define lpfm dword [esp+calloff+16*2+4*3]
+ %define w0m dword [esp+calloff+16*2+4*4]
+ %define hd dword [esp+calloff+16*2+4*5]
+ %define edgeb byte [esp+calloff+16*2+4*6]
+ %define edged dword [esp+calloff+16*2+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %define m8 [base+pd_8]
+ %define m9 [esp+calloff+16*1]
+ %define m10 [base+pd_0xf00801c7]
+ %define m11 [base+pd_34816]
+ %define m12 [base+sgr_lshuf3]
+ %define m13 [base+pw_1023]
+ %define m14 m6
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ add wd, wd
+ mov edged, r7m
+ movq m9, [paramsq+4]
+ add lpfq, wq
+ lea t1, [rsp+wq+12]
+ mova m8, [pd_8]
+ add dstq, wq
+ lea t3, [rsp+wq*2+400*12+8]
+ mova m10, [pd_0xf00801c7]
+ lea t4, [rsp+wq+400*32+8]
+ mova m11, [pd_34816]
+ pshuflw m7, m9, q3333
+ pshufb m9, [pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ neg wq
+ pxor m6, m6
+ mova m13, [pw_1023]
+ psllw m7, 4
+ mova m12, [sgr_lshuf3]
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ add wd, wd
+ movq m1, [r1+4]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*2+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq+400*32+16]
+ mov t3m, t3
+ pshuflw m7, m1, q3333
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 4
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+wq+400*0]
+ mova m1, [t1+wq+400*2]
+ mova m2, [t1+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m1
+ mova [t2+wq+400*4], m2
+ add wq, 16
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+ movd m1, wd
+ movd m5, [lpfq-2]
+ mova m2, [base+pw_256]
+ mova m3, [base+pb_0to15]
+ pshufb m1, m6
+ pshufb m5, m2
+ psubb m2, m1
+ pcmpgtb m2, m3
+ pand m4, m2
+ pandn m2, m5
+ por m4, m2
+ ret
+%assign stack_offset stack_offset+4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 12
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m4, [lpfq+wq+ 0]
+.h_main:
+ movu m5, [lpfq+wq+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+wq+400*0], m1
+ mova [t1+wq+400*2], m2
+ mova [t1+wq+400*4], m3
+ add wq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 12
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m4, [lpfq+wq+ 0]
+.hv0_main:
+ movu m5, [lpfq+wq+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -18
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+wq+400*0]
+ paddd m4, m2, [t1+wq+400*2]
+ paddd m5, m3, [t1+wq+400*4]
+ mova [t1+wq+400*0], m1
+ mova [t1+wq+400*2], m2
+ mova [t1+wq+400*4], m3
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m5, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m4
+ mova [t2+wq+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+ 8], m0
+ mova [t3+wq*2+24], m1
+ add wq, 16
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 12
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m4, [lpfq+wq+ 0]
+.hv1_main:
+ movu m5, [lpfq+wq+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -18
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m4, m2, [t2+wq+400*2]
+ paddd m5, m3, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m2
+ mova [t2+wq+400*4], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ psrld m5, 4
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*1+400*2 +4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq+400*0]
+ mova m4, [t1+wq+400*2]
+ mova m5, [t1+wq+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m5, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m4
+ mova [t2+wq+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*1+400*0+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*0+ 8], m0
+ mova [t3+wq*2+400*0+24], m1
+ add wq, 16
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m0, [t1+wq+400*0]
+ mova m4, [t1+wq+400*2]
+ mova m5, [t1+wq+400*4]
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m5, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m4
+ mova [t2+wq+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*1+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
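+; neighbour passes for the 3x3 filter use the 3/4/3 weighting: each row's
+; horizontal sum is expanded into a "444" (4x) and a "343" (4x minus the
+; outer taps) variant, and .n0/.n1 rotate the 343/444/343 contributions of
+; three consecutive rows through the t4/t3 scratch rows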
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*1+400*0+ 4]
+ movu m1, [t3+wq*2+400*0+ 8]
+ movu m2, [t3+wq*2+400*0+24]
+ movu m3, [t4+wq*1+400*0+ 2]
+ movu m4, [t3+wq*2+400*0+ 4]
+ movu m5, [t3+wq*2+400*0+20]
+ paddw m0, [t4+wq*1+400*0+ 0]
+ paddd m1, [t3+wq*2+400*0+ 0]
+ paddd m2, [t3+wq*2+400*0+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[-1] 444
+ pslld m4, 2 ; b[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a[-1] 343
+ psubd m4, m1 ; b[-1] 343
+ psubd m5, m2
+ mova [t4+wq*1+400*4], m3
+ mova [t3+wq*2+400*8+ 0], m4
+ mova [t3+wq*2+400*8+16], m5
+ movu m0, [t4+wq*1+400*2+ 4]
+ movu m1, [t3+wq*2+400*4+ 8]
+ movu m2, [t3+wq*2+400*4+24]
+ movu m3, [t4+wq*1+400*2+ 2]
+ movu m4, [t3+wq*2+400*4+ 4]
+ movu m5, [t3+wq*2+400*4+20]
+ paddw m0, [t4+wq*1+400*2+ 0]
+ paddd m1, [t3+wq*2+400*4+ 0]
+ paddd m2, [t3+wq*2+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[ 0] 444
+ pslld m4, 2 ; b[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*1+400* 6], m3
+ mova [t3+wq*2+400*12+ 0], m4
+ mova [t3+wq*2+400*12+16], m5
+ psubw m3, m0 ; a[ 0] 343
+ psubd m4, m1 ; b[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*1+400* 8], m3
+ mova [t3+wq*2+400*16+ 0], m4
+ mova [t3+wq*2+400*16+16], m5
+ add wq, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m3, [t4+wq*1+400*0+4]
+ movu m1, [t4+wq*1+400*0+2]
+ paddw m3, [t4+wq*1+400*0+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*1+400*4]
+ paddw m3, [t4+wq*1+400*6]
+ mova [t4+wq*1+400*4], m2
+ mova [t4+wq*1+400*6], m1
+ movu m4, [t3+wq*2+400*0+8]
+ movu m1, [t3+wq*2+400*0+4]
+ paddd m4, [t3+wq*2+400*0+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*2+400* 8+ 0]
+ paddd m4, [t3+wq*2+400*12+ 0]
+ mova [t3+wq*2+400* 8+ 0], m2
+ mova [t3+wq*2+400*12+ 0], m1
+ movu m5, [t3+wq*2+400*0+24]
+ movu m1, [t3+wq*2+400*0+20]
+ paddd m5, [t3+wq*2+400*0+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*2+400* 8+16]
+ paddd m5, [t3+wq*2+400*12+16]
+ mova [t3+wq*2+400* 8+16], m2
+ mova [t3+wq*2+400*12+16], m1
+ mova m0, [dstq+wq]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*1+400*2+4]
+ movu m1, [t4+wq*1+400*2+2]
+ paddw m3, [t4+wq*1+400*2+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*1+400*6]
+ paddw m3, [t4+wq*1+400*8]
+ mova [t4+wq*1+400*6], m1
+ mova [t4+wq*1+400*8], m2
+ movu m4, [t3+wq*2+400*4+8]
+ movu m1, [t3+wq*2+400*4+4]
+ paddd m4, [t3+wq*2+400*4+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*2+400*12+ 0]
+ paddd m4, [t3+wq*2+400*16+ 0]
+ mova [t3+wq*2+400*12+ 0], m1
+ mova [t3+wq*2+400*16+ 0], m2
+ movu m5, [t3+wq*2+400*4+24]
+ movu m1, [t3+wq*2+400*4+20]
+ paddd m5, [t3+wq*2+400*4+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*2+400*12+16]
+ paddd m5, [t3+wq*2+400*16+16]
+ mova [t3+wq*2+400*12+16], m1
+ mova [t3+wq*2+400*16+16], m2
+ mova m0, [dstq+wq]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
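+; sgr_filter_mix runs the 5x5 and 3x3 box accumulations in a single pass
+; (sum5/sumsq5 at +400*0.. and sum3/sumsq3 at +400*6.. in the row buffers),
+; reuses the 5x5 filter's top_fixup/extend_right helpers via mangle(), and
+; blends the two filter outputs with the w0/w1 weights in the neighbour
+; passes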
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 10*16
+ %else
+ %assign extra_stack 8*16
+ %endif
+cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*8+4*0]
+ %define stridemp dword [esp+calloff+16*8+4*1]
+ %define leftm dword [esp+calloff+16*8+4*2]
+ %define lpfm dword [esp+calloff+16*8+4*3]
+ %define w0m dword [esp+calloff+16*8+4*4]
+ %define hd dword [esp+calloff+16*8+4*5]
+ %define edgeb byte [esp+calloff+16*8+4*6]
+ %define edged dword [esp+calloff+16*8+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %xdefine m8 m6
+ %define m9 [base+pd_8]
+ %define m10 [base+pd_34816]
+ %define m11 [base+pd_0xf00801c7]
+ %define m12 [base+pd_0xf00800a4]
+ %define m13 [esp+calloff+16*4]
+ %define m14 [esp+calloff+16*5]
+ %define m15 [esp+calloff+16*6]
+ %define m6 [esp+calloff+16*7]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ add wd, wd
+ mov edged, r7m
+ mova m14, [paramsq]
+ add lpfq, wq
+ mova m9, [pd_8]
+ lea t1, [rsp+wq+44]
+ mova m10, [pd_34816]
+ add dstq, wq
+ mova m11, [pd_0xf00801c7]
+ lea t3, [rsp+wq*2+400*24+40]
+ mova m12, [pd_0xf00800a4]
+ lea t4, [rsp+wq+400*52+40]
+ neg wq
+ pshufd m15, m14, q2222 ; w0 w1
+ punpcklwd m14, m14
+ pshufd m13, m14, q0000 ; s0
+ pshufd m14, m14, q2222 ; s1
+ pxor m6, m6
+ psllw m15, 2
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ add wd, wd
+ mova m2, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq+52]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*2+400*24+48]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq+400*52+48]
+ mov t3m, t3
+ mov t4m, t4
+ neg wq
+ pshuflw m0, m2, q0000
+ pshuflw m1, m2, q2222
+ pshufhw m2, m2, q1010
+ punpcklqdq m0, m0 ; s0
+ punpcklqdq m1, m1 ; s1
+ punpckhqdq m2, m2 ; w0 w1
+ mov w1m, wd
+ pxor m3, m3
+ psllw m2, 2
+ mova m13, m0
+ mova m14, m1
+ sub wd, 4
+ mova m15, m2
+ mova m6, m3
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+%if ARCH_X86_64
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
+%else
+ mov wq, w0m
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop
+%endif
+ add t1, 400*12
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hd, hd
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+wq+400* 0]
+ mova m1, [t1+wq+400* 2]
+ mova m2, [t1+wq+400* 4]
+ paddw m0, m0
+ mova m3, [t1+wq+400* 6]
+ paddd m1, m1
+ mova m4, [t1+wq+400* 8]
+ paddd m2, m2
+ mova m5, [t1+wq+400*10]
+ mova [t2+wq+400* 0], m0
+ mova [t2+wq+400* 2], m1
+ mova [t2+wq+400* 4], m2
+ mova [t2+wq+400* 6], m3
+ mova [t2+wq+400* 8], m4
+ mova [t2+wq+400*10], m5
+ add wq, 16
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+%assign stack_offset stack_offset+4
+%assign calloff 4
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, [base+sgr_lshuf5]
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m4, [lpfq+wq- 2]
+.h_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -20
+ jl .h_have_right
+%if ARCH_X86_32
+ pxor m8, m8
+%endif
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
+.h_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m7, m0, m6
+ pmaddwd m7, m7
+ punpckhwd m0, m6
+ pmaddwd m0, m0
+ paddd m2, m7 ; sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ mova [t1+wq+400* 6], m1
+ mova [t1+wq+400* 8], m2
+ mova [t1+wq+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m7, m2 ; sumsq5
+ paddd m5, m3
+ mova [t1+wq+400* 0], m8
+ mova [t1+wq+400* 2], m7
+ mova [t1+wq+400* 4], m5
+ add wq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, [base+sgr_lshuf5]
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m4, [lpfq+wq- 2]
+.hv0_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -20
+ jl .hv0_have_right
+%if ARCH_X86_32
+ pxor m8, m8
+%endif
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
+.hv0_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ movif32 t3, t3m
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m7, m0, m6
+ pmaddwd m7, m7
+ punpckhwd m0, m6
+ pmaddwd m0, m0
+ paddd m2, m7 ; h sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m7, m2 ; h sumsq5
+ paddd m5, m3
+ mova [t3+wq*2+400*8+ 8], m8
+ mova [t3+wq*2+400*0+ 8], m7
+ mova [t3+wq*2+400*0+24], m5
+ paddw m8, [t1+wq+400* 0]
+ paddd m7, [t1+wq+400* 2]
+ paddd m5, [t1+wq+400* 4]
+ mova [t1+wq+400* 0], m8
+ mova [t1+wq+400* 2], m7
+ mova [t1+wq+400* 4], m5
+ paddw m0, m1, [t1+wq+400* 6]
+ paddd m4, m2, [t1+wq+400* 8]
+ paddd m5, m3, [t1+wq+400*10]
+ mova [t1+wq+400* 6], m1
+ mova [t1+wq+400* 8], m2
+ mova [t1+wq+400*10], m3
+ paddw m1, m0, [t2+wq+400* 6]
+ paddd m2, m4, [t2+wq+400* 8]
+ paddd m3, m5, [t2+wq+400*10]
+ mova [t2+wq+400* 6], m0
+ mova [t2+wq+400* 8], m4
+ mova [t2+wq+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m4, m2, m7
+ MAXSD m5, m3, m7
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, [base+sgr_lshuf5]
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m4, [lpfq+wq- 2]
+.hv1_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -20
+ jl .hv1_have_right
+%if ARCH_X86_32
+ pxor m8, m8
+%endif
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
+.hv1_have_right:
+ palignr m7, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m7, m3
+ punpcklwd m0, m7, m3
+ pmaddwd m0, m0
+ punpckhwd m7, m3
+ pmaddwd m7, m7
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m6
+ pmaddwd m1, m1
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ palignr m5, m4, 8
+ punpckhwd m1, m4, m5
+ paddw m8, m4, m5
+ pmaddwd m1, m1
+ punpcklwd m4, m5
+ pmaddwd m4, m4
+ paddd m7, m3
+ paddw m5, m2, [t2+wq+400* 6]
+ mova [t2+wq+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+wq+400* 8]
+ paddd m3, m7, [t2+wq+400*10]
+ mova [t2+wq+400* 8], m0
+ mova [t2+wq+400*10], m7
+ paddd m4, m0 ; h sumsq5
+ paddd m1, m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m0, m2, 3
+ pslld m7, m3, 3
+ paddd m2, m0 ; ((a3 + 8) >> 4) * 9
+ paddd m3, m7
+ psrlw m7, m5, 1
+ pavgw m7, m6 ; (b3 + 2) >> 2
+ punpcklwd m0, m7, m6
+ pmaddwd m0, m0
+ punpckhwd m7, m6
+ pmaddwd m7, m7
+%if ARCH_X86_32
+ mova [esp+20], m8
+%else
+ SWAP m8, m6
+%endif
+ MAXSD m2, m0, m8
+ MAXSD m3, m7, m8
+ pxor m8, m8
+ psubd m2, m0 ; p3
+ psubd m3, m7
+ punpcklwd m0, m5, m8 ; b3
+ punpckhwd m5, m8
+ MULLD m2, m14, m8 ; p3 * s1
+ MULLD m3, m14, m8
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m5, m11
+ paddusw m2, m11
+ paddusw m3, m11
+ psrld m2, 20 ; min(z3, 255)
+ movif32 t3, t3m
+ psrld m3, 20
+ GATHER_X_BY_X m8, m2, m3, r0, dstm
+ punpcklwd m2, m8, m8
+ punpckhwd m3, m8, m8
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ psrld m0, 12
+ psrld m5, 12
+ mova [t4+wq*1+400*4+4], m8
+ mova [t3+wq*2+400*8+ 8], m0
+ mova [t3+wq*2+400*8+24], m5
+%if ARCH_X86_32
+ mova m8, [esp+20]
+%else
+ SWAP m6, m8
+ pxor m6, m6
+%endif
+ paddw m5, m8, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m1, [t2+wq+400*4]
+ paddw m5, [t1+wq+400*0]
+ paddd m2, [t1+wq+400*2]
+ paddd m3, [t1+wq+400*4]
+ mova [t2+wq+400*0], m8
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ mova [t2+wq+400*2], m4
+ pslld m8, m2, 4
+ mova [t2+wq+400*4], m1
+ pslld m4, m3, 4
+ paddd m8, m2
+ pslld m2, 3
+ paddd m4, m3
+ pslld m3, 3
+ paddd m2, m8 ; ((a5 + 8) >> 4) * 25
+ paddd m3, m4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ psrlw m1, m5, 1
+ pavgw m1, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m1, m7
+ pmaddwd m4, m4
+ punpckhwd m1, m7
+ pmaddwd m1, m1
+ punpcklwd m0, m5, m7 ; b5
+ punpckhwd m5, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m2, m4, m7
+ psubd m2, m4 ; p5
+ MAXSD m3, m1, m7
+ psubd m3, m1
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m5, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m1, m2, m3, r0, dstm
+ punpcklwd m2, m1, m1
+ punpckhwd m3, m1, m1
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ mova [t4+wq*1+400*0+ 4], m1
+ psrld m0, 12
+ psrld m5, 12
+ mova [t3+wq*2+400*0+ 8], m0
+ mova [t3+wq*2+400*0+24], m5
+ add wq, 16
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq+400* 6]
+ mova m4, [t1+wq+400* 8]
+ mova m5, [t1+wq+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq+400* 6]
+ paddd m2, m4, [t2+wq+400* 8]
+ paddd m3, m5, [t2+wq+400*10]
+ mova [t2+wq+400* 6], m0
+ mova [t2+wq+400* 8], m4
+ mova [t2+wq+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m4, m2, m7
+ MAXSD m5, m3, m7
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+wq+400*0]
+ mova m4, [t1+wq+400*2]
+ mova m5, [t1+wq+400*4]
+ mova [t3+wq*2+400*8+ 8], m3
+ mova [t3+wq*2+400*0+ 8], m4
+ mova [t3+wq*2+400*0+24], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+wq+400*0], m3
+ mova [t1+wq+400*2], m4
+ mova [t1+wq+400*4], m5
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m4, [t1+wq+400* 6]
+ mova m5, [t1+wq+400* 8]
+ mova m7, [t1+wq+400*10]
+ paddw m1, m4, [t2+wq+400* 6]
+ paddd m2, m5, [t2+wq+400* 8]
+ paddd m3, m7, [t2+wq+400*10]
+ mova [t2+wq+400* 6], m4
+ mova [t2+wq+400* 8], m5
+ mova [t2+wq+400*10], m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m4, m2, m7
+ MAXSD m5, m3, m7
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*4+4], m3
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova m4, [t3+wq*2+400*8+ 8]
+ mova m5, [t3+wq*2+400*0+ 8]
+ mova m7, [t3+wq*2+400*0+24]
+ paddw m1, m4, [t2+wq+400*0]
+ paddd m2, m5, [t2+wq+400*2]
+ paddd m3, m7, [t2+wq+400*4]
+ paddw m1, [t1+wq+400*0]
+ paddd m2, [t1+wq+400*2]
+ paddd m3, [t1+wq+400*4]
+ mova [t2+wq+400*0], m4
+ mova [t2+wq+400*2], m5
+ mova [t2+wq+400*4], m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ mova [t3+wq*2+400*8+ 8], m0
+ pslld m4, m2, 4
+ mova [t3+wq*2+400*8+24], m8
+ pslld m5, m3, 4
+ paddd m4, m2
+ pslld m2, 3
+ paddd m5, m3
+ pslld m3, 3
+ paddd m2, m4
+ paddd m3, m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ psrlw m5, m1, 1
+ pavgw m5, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m7
+ pmaddwd m4, m4
+ punpckhwd m5, m7
+ pmaddwd m5, m5
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m2, m4, m7
+ psubd m2, m4 ; p5
+ MAXSD m3, m5, m7
+ psubd m3, m5
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m4, m2, m3, r0, dstm
+ punpcklwd m2, m4, m4
+ punpckhwd m3, m4, m4
+ MULLD m0, m2, m7
+ MULLD m1, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*0+ 4], m4
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*0+ 8], m0
+ mova [t3+wq*2+400*0+24], m1
+ add wq, 16
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
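+; A note on the 343/444/565 labels used in the neighbor code below: they
+; name the horizontal weights applied to a row of a/b sums, roughly
+; mirroring the C reference implementation. The 3x3 filter mixes a/b with
+; the kernel 3 4 3 / 4 4 4 / 3 4 3, so each row's sum is kept both as
+; (3,4,3) ("343", used when the row is at the top or bottom of the kernel)
+; and as (4,4,4) ("444", used when it is the middle row), the 343 form
+; being derived from the 444 form by a shift and subtract. The 5x5 filter,
+; whose a/b are only computed on every other row, weights a contributing
+; row as (5,6,5) ("565").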
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*1+400*0+ 2]
+ movu m1, [t3+wq*2+400*0+ 4]
+ movu m2, [t3+wq*2+400*0+20]
+ movu m7, [t4+wq*1+400*0+ 4]
+ movu m8, [t3+wq*2+400*0+ 8]
+ paddw m3, m0, [t4+wq*1+400*0+ 0]
+ paddd m4, m1, [t3+wq*2+400*0+ 0]
+ paddd m5, m2, [t3+wq*2+400*0+16]
+ paddw m3, m7
+ paddd m4, m8
+ movu m7, [t3+wq*2+400*0+24]
+ paddw m0, m3
+ paddd m1, m4
+ psllw m3, 2
+ pslld m4, 2
+ paddd m5, m7
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a5 565
+ paddd m1, m4 ; b5 565
+ paddd m2, m5
+ mova [t4+wq*1+400* 6+ 0], m0
+ mova [t3+wq*2+400*12+ 0], m1
+ mova [t3+wq*2+400*12+16], m2
+ movu m0, [t4+wq*1+400*2+ 4]
+ movu m1, [t3+wq*2+400*4+ 8]
+ movu m2, [t3+wq*2+400*4+24]
+ movu m3, [t4+wq*1+400*2+ 2]
+ movu m4, [t3+wq*2+400*4+ 4]
+ movu m5, [t3+wq*2+400*4+20]
+ paddw m0, [t4+wq*1+400*2+ 0]
+ paddd m1, [t3+wq*2+400*4+ 0]
+ paddd m2, [t3+wq*2+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[-1] 444
+ pslld m4, 2 ; b3[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a3[-1] 343
+ psubd m4, m1 ; b3[-1] 343
+ psubd m5, m2
+ mova [t4+wq*1+400* 8+ 0], m3
+ mova [t3+wq*2+400*16+ 0], m4
+ mova [t3+wq*2+400*16+16], m5
+ movu m0, [t4+wq*1+400*4+ 4]
+ movu m1, [t3+wq*2+400*8+ 8]
+ movu m2, [t3+wq*2+400*8+24]
+ movu m3, [t4+wq*1+400*4+ 2]
+ movu m4, [t3+wq*2+400*8+ 4]
+ movu m5, [t3+wq*2+400*8+20]
+ paddw m0, [t4+wq*1+400*4+ 0]
+ paddd m1, [t3+wq*2+400*8+ 0]
+ paddd m2, [t3+wq*2+400*8+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[ 0] 444
+ pslld m4, 2 ; b3[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*1+400*10+ 0], m3
+ mova [t3+wq*2+400*20+ 0], m4
+ mova [t3+wq*2+400*20+16], m5
+ psubw m3, m0 ; a3[ 0] 343
+ psubd m4, m1 ; b3[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*1+400*12+ 0], m3
+ mova [t3+wq*2+400*24+ 0], m4
+ mova [t3+wq*2+400*24+16], m5
+ add wq, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*1+ 4]
+ movu m2, [t4+wq*1+ 2]
+ paddw m0, [t4+wq*1+ 0]
+ paddw m0, m2
+ paddw m2, m0
+ psllw m0, 2
+ paddw m0, m2 ; a5
+ movu m4, [t3+wq*2+ 8]
+ movu m5, [t3+wq*2+24]
+ movu m1, [t3+wq*2+ 4]
+ movu m3, [t3+wq*2+20]
+ paddd m4, [t3+wq*2+ 0]
+ paddd m5, [t3+wq*2+16]
+ paddd m4, m1
+ paddd m5, m3
+ paddd m1, m4
+ paddd m3, m5
+ pslld m4, 2
+ pslld m5, 2
+ paddd m4, m1 ; b5
+ paddd m5, m3
+ movu m2, [t4+wq*1+400* 6]
+ paddw m2, m0
+ mova [t4+wq*1+400* 6], m0
+ paddd m0, m4, [t3+wq*2+400*12+ 0]
+ paddd m1, m5, [t3+wq*2+400*12+16]
+ mova [t3+wq*2+400*12+ 0], m4
+ mova [t3+wq*2+400*12+16], m5
+ mova [rsp+16+ARCH_X86_32*4], m1
+ movu m3, [t4+wq*1+400*2+4]
+ movu m5, [t4+wq*1+400*2+2]
+ paddw m3, [t4+wq*1+400*2+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ movu m3, [t4+wq*1+400* 8]
+ paddw m3, [t4+wq*1+400*10]
+ paddw m3, m4
+ mova [t4+wq*1+400* 8], m4
+ mova [t4+wq*1+400*10], m5
+ movu m1, [t3+wq*2+400*4+ 8]
+ movu m5, [t3+wq*2+400*4+ 4]
+ movu m7, [t3+wq*2+400*4+24]
+ movu m8, [t3+wq*2+400*4+20]
+ paddd m1, [t3+wq*2+400*4+ 0]
+ paddd m7, [t3+wq*2+400*4+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+%if ARCH_X86_32
+ mova [esp+52], m8
+ psubd m8, m7
+%else
+ psubd m6, m8, m7
+ SWAP m8, m6
+%endif
+ paddd m1, m4, [t3+wq*2+400*16+ 0]
+ paddd m7, m8, [t3+wq*2+400*16+16]
+ paddd m1, [t3+wq*2+400*20+ 0]
+ paddd m7, [t3+wq*2+400*20+16]
+ mova [t3+wq*2+400*16+ 0], m4
+ mova [t3+wq*2+400*16+16], m8
+ mova [t3+wq*2+400*20+ 0], m5
+%if ARCH_X86_32
+ mova m8, [esp+52]
+%else
+ SWAP m8, m6
+ pxor m6, m6
+%endif
+ mova [t3+wq*2+400*20+16], m8
+ mova [rsp+32+ARCH_X86_32*4], m7
+ movu m5, [dstq+wq]
+ punpcklwd m4, m5, m6
+ punpcklwd m7, m2, m6
+ pmaddwd m7, m4 ; a5 * src
+ punpcklwd m8, m3, m6
+ pmaddwd m8, m4 ; a3 * src
+ punpckhwd m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ pslld m4, 13
+ pslld m5, 13
+ psubd m0, m7 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m8 ; b3 - a3 * src + (1 << 8)
+ mova m7, [base+pd_0xffff]
+ psrld m0, 9
+ pslld m1, 7
+ pand m0, m7
+ pandn m8, m7, m1
+ por m0, m8
+ mova m1, [rsp+16+ARCH_X86_32*4]
+ mova m8, [rsp+32+ARCH_X86_32*4]
+ psubd m1, m2
+ psubd m8, m3
+ mova m2, [base+pd_4096]
+ psrld m1, 9
+ pslld m8, 7
+ pand m1, m7
+ pandn m7, m8
+ por m1, m7
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ paddd m4, m2
+ paddd m5, m2
+ paddd m0, m4
+ paddd m1, m5
+ psrad m0, 8
+ psrad m1, 8
+ packssdw m0, m1 ; clip
+ pmaxsw m0, m7
+ psrlw m0, 5
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+%if ARCH_X86_64
+ SWAP m6, m7
+%endif
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*1+400*4+4]
+ movu m5, [t4+wq*1+400*4+2]
+ paddw m3, [t4+wq*1+400*4+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ paddw m3, m4, [t4+wq*1+400*12]
+ paddw m3, [t4+wq*1+400*10]
+ mova [t4+wq*1+400*10], m5
+ mova [t4+wq*1+400*12], m4
+ movu m1, [t3+wq*2+400*8+ 8]
+ movu m5, [t3+wq*2+400*8+ 4]
+ movu m7, [t3+wq*2+400*8+24]
+ movu m8, [t3+wq*2+400*8+20]
+ paddd m1, [t3+wq*2+400*8+ 0]
+ paddd m7, [t3+wq*2+400*8+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+ psubd m0, m8, m7
+ paddd m1, m4, [t3+wq*2+400*24+ 0]
+ paddd m7, m0, [t3+wq*2+400*24+16]
+ paddd m1, [t3+wq*2+400*20+ 0]
+ paddd m7, [t3+wq*2+400*20+16]
+ mova [t3+wq*2+400*20+ 0], m5
+ mova [t3+wq*2+400*20+16], m8
+ mova [t3+wq*2+400*24+ 0], m4
+ mova [t3+wq*2+400*24+16], m0
+ mova m5, [dstq+wq]
+ mova m2, [t4+wq*1+400* 6]
+ punpcklwd m4, m5, m6
+ punpcklwd m8, m2, m6
+ pmaddwd m8, m4 ; a5 * src
+ punpcklwd m0, m3, m6
+ pmaddwd m0, m4 ; a3 * src
+ punpckhwd m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ psubd m1, m0 ; b3 - a3 * src + (1 << 8)
+ pslld m4, 13
+ pslld m5, 13
+ mova m0, [t3+wq*2+400*12+ 0]
+ psubd m0, m8 ; b5 - a5 * src + (1 << 7)
+ mova m8, [t3+wq*2+400*12+16]
+ psubd m8, m2
+ psubd m7, m3
+ mova m2, [base+pd_0xffff]
+ pslld m1, 7
+ psrld m0, 8
+ psrld m8, 8
+ pslld m7, 7
+ pand m0, m2
+ pandn m3, m2, m1
+ por m0, m3
+ pand m8, m2
+ pandn m2, m7
+ por m2, m8
+ mova m1, [base+pd_4096]
+ pmaddwd m0, m15
+ pmaddwd m2, m15
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ pxor m7, m7
+ paddd m4, m1
+ paddd m5, m1
+ paddd m0, m4
+ paddd m2, m5
+ psrad m0, 8
+ psrad m2, 8
+ packssdw m0, m2 ; clip
+ pmaxsw m0, m7
+ psrlw m0, 5
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
diff --git a/third_party/dav1d/src/x86/looprestoration_avx2.asm b/third_party/dav1d/src/x86/looprestoration_avx2.asm
new file mode 100644
index 0000000000..a73cb21882
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_avx2.asm
@@ -0,0 +1,2237 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+sgr_r_ext: times 16 db 1
+ times 16 db 9
+
+; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of
+; cache but eliminates some shifts in the inner sgr loop which is overall a win
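+; (1 KiB instead of 256 bytes; the gathered index, min(z, 255) - 256, can
+; then be used directly as a scaled dword offset from the biased table
+; base, e.g. vpgatherdd [r12+idx*4] with r12 = sgr_x_by_x_avx2+256*4,
+; instead of having to isolate a byte out of each gathered dword afterwards)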
+const sgr_x_by_x_avx2
+ dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
+ dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8
+ dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5
+ dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
+ dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3
+ dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+ dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
+
+ times 4 db -1 ; needed for 16-bit sgr
+pb_m5: times 4 db -5
+pb_3: times 4 db 3
+pw_5_6: dw 5, 6
+
+sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
+ db 9, -1, 10, -1, 11, -1, 12, -1
+
+pw_256: times 2 dw 256
+pw_2056: times 2 dw 2056
+pw_m16380: times 2 dw -16380
+pd_25: dd 25
+pd_34816: dd 34816
+pd_m4096: dd -4096
+pd_0xf00801c7: dd 0xf00801c7
+pd_0xf00800a4: dd 0xf00800a4
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
+
+INIT_YMM avx2
+cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov wd, wm
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastb m11, [fltq+ 0] ; x0 x0
+ vbroadcasti128 m7, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m8, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m9, [sgr_shuf+6]
+ add lpfq, wq
+ vpbroadcastd m10, [pw_m16380]
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq*2+16]
+ psllw m14, 5
+ neg wq
+ psllw m15, 5
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.extend_right:
+ movd xm2, r10d
+ vpbroadcastd m0, [pb_3]
+ vpbroadcastd m1, [pb_m5]
+ vpbroadcastb m2, xm2
+ movu m3, [pb_0to31]
+ psubb m0, m2
+ psubb m1, m2
+ pminub m0, m3
+ pminub m1, m3
+ pshufb m4, m0
+ pshufb m5, m1
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, [wiener_l_shuf]
+ jmp .h_main
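+; (the sequence above synthesizes the [lpfq-4] input window without reading
+;  below lpfq: the broadcast + palignr shift the row right by 4 bytes within
+;  each lane, and wiener_l_shuf then replicates pixel 0 into the four
+;  left-padding positions; the adjacent pb_0to31 row supplies an identity
+;  shuffle for the upper lane, which keeps its normal pixels)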
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ vpbroadcastd m2, [pw_2056]
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m2
+ paddw m1, m2
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, [wiener_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t4+r10*2]
+ paddw m2, [t2+r10*2]
+ mova m3, [t3+r10*2]
+ paddsw m0, m4
+ vpbroadcastd m4, [pw_2056]
+ paddsw m1, m5
+ mova m5, [t5+r10*2]
+ paddw m5, [t1+r10*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m4
+ paddw m1, m4
+ paddw m4, m0, [t6+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t4+r10*2+32]
+ paddw m2, [t2+r10*2+32]
+ mova m3, [t3+r10*2+32]
+ mova m5, [t5+r10*2+32]
+ paddw m5, [t1+r10*2+32]
+ packuswb m0, m4
+ paddw m4, m1, [t6+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m2, [t4+r10*2+ 0]
+ paddw m2, [t2+r10*2+ 0]
+ mova m4, [t3+r10*2+ 0]
+ mova m6, [t1+r10*2+ 0]
+ paddw m8, m6, [t6+r10*2+ 0]
+ paddw m6, [t5+r10*2+ 0]
+ mova m3, [t4+r10*2+32]
+ paddw m3, [t2+r10*2+32]
+ mova m5, [t3+r10*2+32]
+ mova m7, [t1+r10*2+32]
+ paddw m9, m7, [t6+r10*2+32]
+ paddw m7, [t5+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m4, m8, m6
+ pmaddwd m4, m14
+ punpckhwd m6, m8, m6
+ pmaddwd m6, m14
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m9, m7
+ pmaddwd m5, m14
+ punpckhwd m7, m9, m7
+ pmaddwd m7, m14
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ packuswb m0, m2
+ packuswb m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
+cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov wd, wm
+ vbroadcasti128 m6, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m7, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m8, [sgr_shuf+6]
+ add lpfq, wq
+ vpbroadcastd m9, [pw_m16380]
+ vpbroadcastd m10, [pw_2056]
+ mova m11, [wiener_l_shuf]
+ vpbroadcastd m14, [fltq+16] ; __ y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq*2+16]
+ psllw m14, 5
+ neg wq
+ psllw m15, 5
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, m11
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, m11
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t3+r10*2]
+ paddw m2, [t1+r10*2]
+ mova m3, [t2+r10*2]
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ paddw m4, m0, [t4+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+r10*2+32]
+ paddw m2, [t1+r10*2+32]
+ mova m3, [t2+r10*2+32]
+ packuswb m0, m4
+ paddw m4, m1, [t4+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+ psrld m13, m14, 16 ; y1 __
+.v_loop:
+ mova m6, [t1+r10*2+ 0]
+ paddw m2, m6, [t3+r10*2+ 0]
+ mova m4, [t2+r10*2+ 0]
+ mova m7, [t1+r10*2+32]
+ paddw m3, m7, [t3+r10*2+32]
+ mova m5, [t2+r10*2+32]
+ paddw m6, [t4+r10*2+ 0]
+ paddw m7, [t4+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m7, m6
+ pmaddwd m4, m5, m14
+ punpckhwd m7, m6
+ pmaddwd m6, m7, m14
+ pmaddwd m5, m13
+ pmaddwd m7, m13
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ packuswb m0, m2
+ packuswb m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ ret
+
+cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r12-sgr_x_by_x_avx2-256*4
+ lea r12, [sgr_x_by_x_avx2+256*4]
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti128 m8, [base+sgr_shuf+0]
+ vbroadcasti128 m9, [base+sgr_shuf+8]
+ add lpfq, wq
+ vbroadcasti128 m10, [base+sgr_shuf+2]
+ add dstq, wq
+ vbroadcasti128 m11, [base+sgr_shuf+6]
+ lea t3, [rsp+wq*4+16+400*12]
+ vpbroadcastd m12, [paramsq+0] ; s0
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+8] ; w0
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m13, [base+pd_0xf00800a4]
+ neg wq
+ vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15)
+ psllw m7, 4
+ vpbroadcastd m15, [base+pd_m4096]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+400*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+ movd xm2, r10d
+ mova m0, [sgr_r_ext]
+ vpbroadcastb m2, xm2
+ psubb m0, m2
+ pminub m0, [pb_0to31]
+ pshufb m5, m0
+ ret
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu xm5, [lpfq+r10-2]
+.h_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m3, m5, m8
+ pmullw m4, m3, m3
+ pshufb m2, m5, m9
+ paddw m0, m3, m2
+ shufps m3, m2, q2121
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ punpcklwd m3, m4, m6
+ paddd m1, m3
+ punpckhwd m4, m6
+ paddd m2, m4
+ pshufb m4, m5, m10
+ paddw m0, m4
+ pshufb m5, m11
+ paddw m0, m5 ; sum
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ punpckhwd m4, m5
+ pmaddwd m4, m4
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10*2+400*0]
+ paddd m1, [t1+r10*2+400*2]
+ paddd m2, [t1+r10*2+400*4]
+.h_loop_end:
+ paddd m1, m3 ; sumsq
+ paddd m2, m4
+ mova [t1+r10*2+400*0], m0
+ mova [t1+r10*2+400*2], m1
+ mova [t1+r10*2+400*4], m2
+ add r10, 16
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-2]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10*2+400*0]
+ mova m1, [t1+r10*2+400*2]
+ mova m2, [t1+r10*2+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10*2+400*0], m0
+ mova [t2+r10*2+400*2], m1
+ mova [t2+r10*2+400*4], m2
+ add r10, 16
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu xm5, [lpfq+r10-2]
+.hv_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m1, m5, m8
+ pmullw m4, m1, m1
+ pshufb m3, m5, m9
+ paddw m0, m1, m3
+ shufps m1, m3, q2121
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ punpcklwd m1, m4, m6
+ paddd m2, m1
+ punpckhwd m4, m6
+ paddd m3, m4
+ pshufb m1, m5, m10
+ paddw m0, m1
+ pshufb m5, m11
+ paddw m0, m5 ; h sum
+ punpcklwd m4, m5, m1
+ pmaddwd m4, m4
+ punpckhwd m5, m1
+ pmaddwd m5, m5
+ paddw m1, m0, [t1+r10*2+400*0]
+ paddd m2, m4 ; h sumsq
+ paddd m3, m5
+ paddd m4, m2, [t1+r10*2+400*2]
+ paddd m5, m3, [t1+r10*2+400*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10*2+400*0] ; hv sum
+ paddd m4, [t2+r10*2+400*2] ; hv sumsq
+ paddd m5, [t2+r10*2+400*4]
+ mova [t0+r10*2+400*0], m0
+ mova [t0+r10*2+400*2], m2
+ mova [t0+r10*2+400*4], m3
+ vpbroadcastd m2, [pd_25]
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m4, m2 ; a * 25
+ pmulld m5, m2
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m12 ; p * s
+ pmulld m5, m12
+ pmaddwd m0, m13 ; b * 164
+ pmaddwd m1, m13
+ paddusw m4, m13
+ paddusw m5, m13
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ pand m0, m15
+ pand m1, m15
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0 ; The neighbor calculations require
+ vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b.
+ mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but
+ vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way.
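+; (dword layout: bits 0-11 hold x ("a", at most 255) and bits 12-31 hold
+;  the rounded x * b * 164 term ("b"), pd_m4096 having cleared b's low 12
+;  bits before the two are or'd together; a single row's 565-weighted sum
+;  of such dwords stays within the 12-bit a field (16 * 255 = 4080), so
+;  whole packed values can be added in .prep_n/.n0/.n1, and only the final
+;  two-row total, which would need 13 bits, is formed after splitting the
+;  fields)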
+ add r10, 16
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10*2+400*0], m1
+ paddw m1, m0
+ mova [t1+r10*2+400*2], m4
+ paddd m4, m2
+ mova [t1+r10*2+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m0, [t1+r10*2+400*0]
+ mova m2, [t1+r10*2+400*2]
+ mova m3, [t1+r10*2+400*4]
+ paddw m1, m0, [t2+r10*2+400*0]
+ paddd m4, m2, [t2+r10*2+400*2]
+ paddd m5, m3, [t2+r10*2+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ paddd m5, m3
+ vpbroadcastd m2, [pd_25]
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m4, m2 ; a * 25
+ pmulld m5, m2
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m12 ; p * s
+ pmulld m5, m12
+ pmaddwd m0, m13 ; b * 164
+ pmaddwd m1, m13
+ paddusw m4, m13
+ paddusw m5, m13
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ pand m0, m15
+ pand m1, m15
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0
+ vextracti128 [t3+r10*4+40], m0, 1
+ mova [t3+r10*4+24], xm1
+ vextracti128 [t3+r10*4+56], m1, 1
+ add r10, 16
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+ 4]
+ movu m1, [t3+r10*4+36]
+ paddd m2, m0, [t3+r10*4+ 0]
+ paddd m3, m1, [t3+r10*4+32]
+ paddd m2, [t3+r10*4+ 8]
+ paddd m3, [t3+r10*4+40]
+ paddd m0, m2
+ pslld m2, 2
+ paddd m1, m3
+ pslld m3, 2
+ paddd m2, m0 ; ab 565
+ paddd m3, m1
+ pandn m0, m15, m2 ; a
+ psrld m2, 12 ; b
+ pandn m1, m15, m3
+ psrld m3, 12
+ mova [t3+r10*4+400*4+ 0], m0
+ mova [t3+r10*4+400*8+ 0], m2
+ mova [t3+r10*4+400*4+32], m1
+ mova [t3+r10*4+400*8+32], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t3+r10*4+ 4]
+ movu m1, [t3+r10*4+36]
+ paddd m2, m0, [t3+r10*4+ 0]
+ paddd m3, m1, [t3+r10*4+32]
+ paddd m2, [t3+r10*4+ 8]
+ paddd m3, [t3+r10*4+40]
+ paddd m0, m2
+ pslld m2, 2
+ paddd m1, m3
+ pslld m3, 2
+ paddd m2, m0
+ paddd m3, m1
+ pandn m0, m15, m2
+ psrld m2, 12
+ pandn m1, m15, m3
+ psrld m3, 12
+ paddd m4, m0, [t3+r10*4+400*4+ 0] ; a
+ paddd m5, m1, [t3+r10*4+400*4+32]
+ mova [t3+r10*4+400*4+ 0], m0
+ mova [t3+r10*4+400*4+32], m1
+ paddd m0, m2, [t3+r10*4+400*8+ 0] ; b
+ paddd m1, m3, [t3+r10*4+400*8+32]
+ mova [t3+r10*4+400*8+ 0], m2
+ mova [t3+r10*4+400*8+32], m3
+ pmovzxbd m2, [dstq+r10+0]
+ pmovzxbd m3, [dstq+r10+8]
+ pmaddwd m4, m2 ; a * src
+ pmaddwd m5, m3
+ packssdw m2, m3
+ psubd m0, m4 ; b - a * src + (1 << 8)
+ psubd m1, m5
+ psrad m0, 9
+ psrad m1, 9
+ packssdw m0, m1
+ pmulhrsw m0, m7
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ pshufd xm0, xm0, q3120
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ pmovzxbd m2, [dstq+r10+0]
+ pmovzxbd m3, [dstq+r10+8]
+ pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src
+ pmaddwd m5, m3, [t3+r10*4+400*4+32]
+ mova m0, [t3+r10*4+400*8+ 0] ; b
+ mova m1, [t3+r10*4+400*8+32]
+ packssdw m2, m3
+ psubd m0, m4 ; b - a * src + (1 << 7)
+ psubd m1, m5
+ psrad m0, 8
+ psrad m1, 8
+ packssdw m0, m1
+ pmulhrsw m0, m7
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ pshufd xm0, xm0, q3120
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r14-sgr_x_by_x_avx2-256*4
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ lea r14, [sgr_x_by_x_avx2+256*4]
+ vbroadcasti128 m8, [base+sgr_shuf+2]
+ add lpfq, wq
+ vbroadcasti128 m9, [base+sgr_shuf+4]
+ add dstq, wq
+ vbroadcasti128 m10, [base+sgr_shuf+6]
+ lea t3, [rsp+wq*4+16+400*12]
+ vpbroadcastd m11, [paramsq+ 4] ; s1
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+10] ; w1
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m12, [base+pd_0xf00801c7]
+ neg wq
+ vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15)
+ psllw m7, 4
+ vpbroadcastd m14, [base+pd_m4096]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add t4, strideq
+ mov [rsp], t4 ; below
+ mov t0, t2
+ call .hv
+.main:
+ mov t5, t3
+ add t3, 400*4
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ dec hd
+ jz .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv
+ call .n
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv_bottom
+ call .n
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n
+ RET
+.height1:
+ call .v
+ call .prep_n
+ mov t2, t1
+ call .v
+ jmp .end
+.extend_bottom:
+ call .v
+ call .n
+ mov t2, t1
+ call .v
+ jmp .end
+.no_top:
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea t4, [t4+strideq*2]
+ mov [rsp], t4
+ call .h
+ lea t0, [t1+400*6]
+ mov t2, t1
+ call .v
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu xm5, [lpfq+r10-2]
+.h_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -17
+ jl .h_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.h_have_right:
+ pshufb m0, m5, m8
+ pmullw m2, m0, m0
+ pshufb m4, m5, m9
+ paddw m0, m4
+ pshufb m5, m10
+ paddw m0, m5 ; sum
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ punpckhwd m4, m5
+ pmaddwd m4, m4
+ punpcklwd m1, m2, m6
+ punpckhwd m2, m6
+ mova [t1+r10*2+400*0], m0
+ paddd m1, m3 ; sumsq
+ paddd m2, m4
+ mova [t1+r10*2+400*2], m1
+ mova [t1+r10*2+400*4], m2
+ add r10, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu xm5, [lpfq+r10-2]
+.hv_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -17
+ jl .hv_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.hv_have_right:
+ pshufb m0, m5, m8
+ pmullw m3, m0, m0
+ pshufb m1, m5, m9
+ paddw m0, m1
+ pshufb m5, m10
+ paddw m0, m5 ; h sum
+ punpcklwd m4, m5, m1
+ pmaddwd m4, m4
+ punpckhwd m5, m1
+ pmaddwd m5, m5
+ paddw m1, m0, [t2+r10*2+400*0]
+ paddw m1, [t1+r10*2+400*0] ; hv sum
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6
+ paddd m4, m2 ; h sumsq
+ paddd m5, m3
+ paddd m2, m4, [t2+r10*2+400*2]
+ paddd m3, m5, [t2+r10*2+400*4]
+ paddd m2, [t1+r10*2+400*2] ; hv sumsq
+ paddd m3, [t1+r10*2+400*4]
+ mova [t0+r10*2+400*0], m0
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ mova [t0+r10*2+400*2], m4
+ pslld m4, m2, 3
+ mova [t0+r10*2+400*4], m5
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ pmaddwd m2, m0, m0 ; b * b
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m11 ; p * s
+ pmulld m5, m11
+ pmaddwd m0, m12 ; b * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r14+m3*4], m4
+ psrad m4, m5, 20
+ vpgatherdd m3, [r14+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ pand m0, m14
+ pand m1, m14
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0
+ vextracti128 [t3+r10*4+40], m0, 1
+ mova [t3+r10*4+24], xm1
+ vextracti128 [t3+r10*4+56], m1, 1
+ add r10, 16
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m1, [t1+r10*2+400*0]
+ paddw m1, m1
+ paddw m1, [t2+r10*2+400*0] ; hv sum
+ mova m2, [t1+r10*2+400*2]
+ mova m3, [t1+r10*2+400*4]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m2, [t2+r10*2+400*2] ; hv sumsq
+ paddd m3, [t2+r10*2+400*4]
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ pmaddwd m2, m0, m0 ; b * b
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m11 ; p * s
+ pmulld m5, m11
+ pmaddwd m0, m12 ; b * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r14+m3*4], m4
+ psrad m4, m5, 20
+ vpgatherdd m3, [r14+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ pand m0, m14
+ pand m1, m14
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0
+ vextracti128 [t3+r10*4+40], m0, 1
+ mova [t3+r10*4+24], xm1
+ vextracti128 [t3+r10*4+56], m1, 1
+ add r10, 16
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+ mov t4, t3
+ add t3, 400*4
+.prep_n_loop:
+ mova m2, [t5+r10*4+0]
+ mova m3, [t4+r10*4+0]
+ paddd m2, [t5+r10*4+8]
+ paddd m3, [t4+r10*4+8]
+ paddd m0, m2, [t5+r10*4+4]
+ paddd m1, m3, [t4+r10*4+4]
+ pslld m0, 2
+ paddd m1, m1 ; ab[ 0] 222
+ psubd m0, m2 ; ab[-1] 343
+ mova [t3+r10*4+400*4], m1
+ paddd m1, m1
+ mova [t5+r10*4], m0
+ psubd m1, m3 ; ab[ 0] 343
+ mova [t4+r10*4], m1
+ add r10, 8
+ jl .prep_n_loop
+ ret
+; a+b are packed together in a single dword, but we can't do the
+; full neighbor calculations before splitting them since we don't
+; have sufficient precision. The solution is to do the calculations
+; in two equal halves and split a and b before doing the final sum.
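+; Concretely, .n below forms half0 = ab[-1]*343 + ab[ 0]*222 and
+; half1 = ab[ 0]*222 + ab[+1]*343 (222 being half of the middle row's 444
+; weight), extracts a (low 12 bits) and b (upper bits) from each half, and
+; only then adds the two halves, so every packed intermediate stays within
+; the 12+20-bit fields.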
+ALIGN function_align
+.n: ; neighbor + output
+ mov r10, wq
+.n_loop:
+ mova m4, [t3+r10*4+ 0]
+ paddd m4, [t3+r10*4+ 8]
+ paddd m5, m4, [t3+r10*4+ 4]
+ paddd m5, m5 ; ab[+1] 222
+ mova m2, [t3+r10*4+400*4+ 0]
+ paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
+ mova m3, [t3+r10*4+400*4+32]
+ paddd m1, m3, [t5+r10*4+32]
+ mova [t3+r10*4+400*4+ 0], m5
+ paddd m5, m5
+ psubd m5, m4 ; ab[+1] 343
+ mova [t5+r10*4+ 0], m5
+ paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343
+ mova m4, [t3+r10*4+32]
+ paddd m4, [t3+r10*4+40]
+ paddd m5, m4, [t3+r10*4+36]
+ paddd m5, m5
+ mova [t3+r10*4+400*4+32], m5
+ paddd m5, m5
+ psubd m5, m4
+ mova [t5+r10*4+32], m5
+ pandn m4, m14, m0
+ psrld m0, 12
+ paddd m3, m5
+ pandn m5, m14, m2
+ psrld m2, 12
+ paddd m4, m5 ; a
+ pandn m5, m14, m1
+ psrld m1, 12
+ paddd m0, m2 ; b + (1 << 8)
+ pandn m2, m14, m3
+ psrld m3, 12
+ paddd m5, m2
+ pmovzxbd m2, [dstq+r10+0]
+ paddd m1, m3
+ pmovzxbd m3, [dstq+r10+8]
+ pmaddwd m4, m2 ; a * src
+ pmaddwd m5, m3
+ packssdw m2, m3
+ psubd m0, m4 ; b - a * src + (1 << 8)
+ psubd m1, m5
+ psrad m0, 9
+ psrad m1, 9
+ packssdw m0, m1
+ pmulhrsw m0, m7
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ pshufd xm0, xm0, q3120
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n_loop
+ mov r10, t5
+ mov t5, t4
+ mov t4, r10
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r12-sgr_x_by_x_avx2-256*4
+ lea r12, [sgr_x_by_x_avx2+256*4]
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti128 m9, [base+sgr_shuf+0]
+ vbroadcasti128 m10, [base+sgr_shuf+8]
+ add lpfq, wq
+ vbroadcasti128 m11, [base+sgr_shuf+2]
+ vbroadcasti128 m12, [base+sgr_shuf+6]
+ add dstq, wq
+ vpbroadcastd m15, [paramsq+8] ; w0 w1
+ lea t3, [rsp+wq*4+400*24+8]
+ vpbroadcastd m13, [paramsq+0] ; s0
+ pxor m7, m7
+ vpbroadcastd m14, [paramsq+4] ; s1
+ lea t1, [rsp+wq*2+12]
+ neg wq
+ psllw m15, 2 ; to reuse existing pd_m4096 register for rounding
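+; (with w0/w1 scaled by 4 the final blend in .n0/.n1 becomes a >> 13 with a
+;  +4096 bias, which is obtained by subtracting the pd_m4096 constant that
+;  is already resident for unpacking a | (b << 12), instead of loading a
+;  separate rounding constant)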
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
+ add t1, 400*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+400*12]
+ lea r10, [wq-2]
+.top_fixup_loop:
+ mova m0, [t1+r10*2+400* 0]
+ mova m1, [t1+r10*2+400* 2]
+ mova m2, [t1+r10*2+400* 4]
+ paddw m0, m0
+ mova m3, [t1+r10*2+400* 6]
+ paddd m1, m1
+ mova m4, [t1+r10*2+400* 8]
+ paddd m2, m2
+ mova m5, [t1+r10*2+400*10]
+ mova [t2+r10*2+400* 0], m0
+ mova [t2+r10*2+400* 2], m1
+ mova [t2+r10*2+400* 4], m2
+ mova [t2+r10*2+400* 6], m3
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ add r10, 16
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsums
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu xm5, [lpfq+r10-2]
+.h_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -18
+ jl .h_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.h_have_right:
+ pshufb m6, m5, m9
+ pshufb m4, m5, m10
+ paddw m8, m6, m4
+ shufps m0, m6, m4, q2121
+ pmullw m3, m0, m0
+ pshufb m2, m5, m11
+ paddw m0, m2
+ pshufb m5, m12
+ paddw m0, m5 ; sum3
+ punpcklwd m1, m2, m5
+ pmaddwd m1, m1
+ punpckhwd m2, m5
+ pmaddwd m2, m2
+ punpcklwd m5, m6, m4
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ punpcklwd m4, m3, m7
+ paddd m1, m4 ; sumsq3
+ punpckhwd m3, m7
+ paddd m2, m3
+ mova [t1+r10*2+400* 6], m0
+ mova [t1+r10*2+400* 8], m1
+ mova [t1+r10*2+400*10], m2
+ paddw m8, m0 ; sum5
+ paddd m5, m1 ; sumsq5
+ paddd m6, m2
+ mova [t1+r10*2+400* 0], m8
+ mova [t1+r10*2+400* 2], m5
+ mova [t1+r10*2+400* 4], m6
+ add r10, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv0_main
+.hv0_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu xm5, [lpfq+r10-2]
+.hv0_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -18
+ jl .hv0_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.hv0_have_right:
+ pshufb m6, m5, m9
+ pshufb m4, m5, m10
+ paddw m8, m6, m4
+ shufps m1, m6, m4, q2121
+ pmullw m0, m1, m1
+ pshufb m3, m5, m11
+ paddw m1, m3
+ pshufb m5, m12
+ paddw m1, m5 ; sum3
+ punpcklwd m2, m3, m5
+ pmaddwd m2, m2
+ punpckhwd m3, m5
+ pmaddwd m3, m3
+ punpcklwd m5, m6, m4
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ punpcklwd m4, m0, m7
+ paddd m2, m4 ; sumsq3
+ punpckhwd m0, m7
+ paddd m3, m0
+ paddw m8, m1 ; sum5
+ paddd m5, m2 ; sumsq5
+ paddd m6, m3
+ mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
+ mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
+ mova [t3+r10*4+400*0+40], m6
+ paddw m8, [t1+r10*2+400* 0]
+ paddd m5, [t1+r10*2+400* 2]
+ paddd m6, [t1+r10*2+400* 4]
+ mova [t1+r10*2+400* 0], m8
+ mova [t1+r10*2+400* 2], m5
+ mova [t1+r10*2+400* 4], m6
+ paddw m0, m1, [t1+r10*2+400* 6]
+ paddd m4, m2, [t1+r10*2+400* 8]
+ paddd m5, m3, [t1+r10*2+400*10]
+ mova [t1+r10*2+400* 6], m1
+ mova [t1+r10*2+400* 8], m2
+ mova [t1+r10*2+400*10], m3
+ paddw m1, m0, [t2+r10*2+400* 6]
+ paddd m2, m4, [t2+r10*2+400* 8]
+ paddd m3, m5, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 6], m0
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m4, m2
+ paddusw m5, m2
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ vpbroadcastd m4, [base+pd_34816]
+ pmulld m0, m2
+ vpbroadcastd m5, [base+pd_m4096]
+ pmulld m1, m3
+ paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m4
+ pand m0, m5
+ pand m1, m5
+ por m0, m2 ; a3 | (b3 << 12)
+ por m1, m3
+ mova [t3+r10*4+400*4+ 8], xm0
+ vextracti128 [t3+r10*4+400*4+40], m0, 1
+ mova [t3+r10*4+400*4+24], xm1
+ vextracti128 [t3+r10*4+400*4+56], m1, 1
+ add r10, 16
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv1_main
+.hv1_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu xm5, [lpfq+r10-2]
+.hv1_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -18
+ jl .hv1_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.hv1_have_right:
+ pshufb m6, m5, m9
+ pshufb m3, m5, m10
+ paddw m8, m6, m3
+ shufps m2, m6, m3, q2121
+ pmullw m1, m2, m2
+ pshufb m0, m5, m11
+ paddw m2, m0
+ pshufb m5, m12
+ paddw m2, m5 ; sum3
+ punpcklwd m4, m5, m0
+ pmaddwd m4, m4
+ punpckhwd m5, m0
+ pmaddwd m5, m5
+ punpcklwd m0, m6, m3
+ pmaddwd m0, m0
+ punpckhwd m6, m3
+ pmaddwd m6, m6
+ punpcklwd m3, m1, m7
+ paddd m4, m3 ; sumsq3
+ punpckhwd m1, m7
+ paddd m5, m1
+ paddw m1, m2, [t2+r10*2+400* 6]
+ mova [t2+r10*2+400* 6], m2
+ paddw m8, m2 ; sum5
+ paddd m2, m4, [t2+r10*2+400* 8]
+ paddd m3, m5, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ paddd m4, m0 ; sumsq5
+ paddd m5, m6
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m6, m2, 3
+ pslld m7, m3, 3
+ paddd m6, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m7, m3
+ pmaddwd m3, m1, m1
+ psubd m6, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m7, m3
+ pmulld m6, m14 ; p3 * s1
+ pmulld m7, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m6, m2
+ paddusw m7, m2
+ psrad m3, m6, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m6 ; x3
+ psrad m6, m7, 20
+ vpgatherdd m3, [r12+m6*4], m7
+ vpbroadcastd m6, [base+pd_34816]
+ pmulld m0, m2
+ vpbroadcastd m7, [base+pd_m4096]
+ pmulld m1, m3
+ paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ pand m0, m7
+ pand m7, m1
+ por m0, m2 ; a3 | (b3 << 12)
+ por m7, m3
+ paddw m1, m8, [t2+r10*2+400*0]
+ paddd m2, m4, [t2+r10*2+400*2]
+ paddd m3, m5, [t2+r10*2+400*4]
+ paddw m1, [t1+r10*2+400*0]
+ paddd m2, [t1+r10*2+400*2]
+ paddd m3, [t1+r10*2+400*4]
+ mova [t2+r10*2+400*0], m8
+ mova [t2+r10*2+400*2], m4
+ mova [t2+r10*2+400*4], m5
+ mova [t3+r10*4+400*8+ 8], xm0
+ vextracti128 [t3+r10*4+400*8+40], m0, 1
+ mova [t3+r10*4+400*8+24], xm7
+ vextracti128 [t3+r10*4+400*8+56], m7, 1
+ vpbroadcastd m4, [base+pd_25]
+ pxor m7, m7
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmulld m2, m4 ; a5 * 25
+ pmulld m3, m4
+ pmaddwd m4, m0, m0 ; b5 * b5
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [base+pd_0xf00800a4]
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r12+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r12+m2*4], m3
+ pmulld m0, m4
+ pmulld m1, m5
+ paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ vpbroadcastd m6, [base+pd_m4096]
+ pand m0, m6
+ pand m1, m6
+ por m0, m4 ; a5 | (b5 << 12)
+ por m1, m5
+ mova [t3+r10*4+400*0+ 8], xm0
+ vextracti128 [t3+r10*4+400*0+40], m0, 1
+ mova [t3+r10*4+400*0+24], xm1
+ vextracti128 [t3+r10*4+400*0+56], m1, 1
+ add r10, 16
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-2]
+ vpbroadcastd m6, [base+pd_34816]
+ vpbroadcastd m8, [base+pd_m4096]
+.v0_loop:
+ mova m0, [t1+r10*2+400* 6]
+ mova m4, [t1+r10*2+400* 8]
+ mova m5, [t1+r10*2+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+r10*2+400* 6]
+ paddd m2, m4, [t2+r10*2+400* 8]
+ paddd m3, m5, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 6], m0
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m4, m2
+ paddusw m5, m2
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ pand m0, m8
+ pand m1, m8
+ por m0, m2 ; a3 | (b3 << 12)
+ por m1, m3
+ mova m2, [t1+r10*2+400*0]
+ mova m3, [t1+r10*2+400*2]
+ mova m4, [t1+r10*2+400*4]
+ mova [t3+r10*4+400*8+ 8], m2
+ mova [t3+r10*4+400*0+ 8], m3
+ mova [t3+r10*4+400*0+40], m4
+ paddw m2, m2 ; cc5
+ paddd m3, m3
+ paddd m4, m4
+ mova [t1+r10*2+400*0], m2
+ mova [t1+r10*2+400*2], m3
+ mova [t1+r10*2+400*4], m4
+ mova [t3+r10*4+400*4+ 8], xm0
+ vextracti128 [t3+r10*4+400*4+40], m0, 1
+ mova [t3+r10*4+400*4+24], xm1
+ vextracti128 [t3+r10*4+400*4+56], m1, 1
+ add r10, 16
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+.v1_loop:
+ mova m4, [t1+r10*2+400* 6]
+ mova m5, [t1+r10*2+400* 8]
+ mova m6, [t1+r10*2+400*10]
+ paddw m1, m4, [t2+r10*2+400* 6]
+ paddd m2, m5, [t2+r10*2+400* 8]
+ paddd m3, m6, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 6], m4
+ mova [t2+r10*2+400* 8], m5
+ mova [t2+r10*2+400*10], m6
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m4, m2
+ paddusw m5, m2
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ vpbroadcastd m4, [base+pd_34816]
+ pmulld m0, m2
+ vpbroadcastd m8, [base+pd_m4096]
+ pmulld m1, m3
+ paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m4
+ pand m0, m8
+ pand m8, m1
+ por m0, m2 ; a3 | (b3 << 12)
+ por m8, m3
+ mova m4, [t3+r10*4+400*8+ 8]
+ mova m5, [t3+r10*4+400*0+ 8]
+ mova m6, [t3+r10*4+400*0+40]
+ paddw m1, m4, [t2+r10*2+400*0]
+ paddd m2, m5, [t2+r10*2+400*2]
+ paddd m3, m6, [t2+r10*2+400*4]
+ paddw m1, [t1+r10*2+400*0]
+ paddd m2, [t1+r10*2+400*2]
+ paddd m3, [t1+r10*2+400*4]
+ mova [t2+r10*2+400*0], m4
+ mova [t2+r10*2+400*2], m5
+ mova [t2+r10*2+400*4], m6
+ vpbroadcastd m4, [base+pd_25]
+ mova [t3+r10*4+400*8+ 8], xm0
+ vextracti128 [t3+r10*4+400*8+40], m0, 1
+ mova [t3+r10*4+400*8+24], xm8
+ vextracti128 [t3+r10*4+400*8+56], m8, 1
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmulld m2, m4 ; a5 * 25
+ pmulld m3, m4
+ pmaddwd m4, m0, m0 ; b5 * b5
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [base+pd_0xf00800a4]
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r12+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r12+m2*4], m3
+ pmulld m0, m4
+ vpbroadcastd m6, [base+pd_34816]
+ pmulld m1, m5
+ paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ vpbroadcastd m6, [base+pd_m4096]
+ pand m0, m6
+ pand m1, m6
+ por m0, m4 ; a5 | (b5 << 12)
+ por m1, m5
+ mova [t3+r10*4+400*0+ 8], xm0
+ vextracti128 [t3+r10*4+400*0+40], m0, 1
+ mova [t3+r10*4+400*0+24], xm1
+ vextracti128 [t3+r10*4+400*0+56], m1, 1
+ add r10, 16
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+400*0+4]
+ paddd m1, m0, [t3+r10*4+400*0+0]
+ mova m4, [t3+r10*4+400*4+0]
+ paddd m1, [t3+r10*4+400*0+8]
+ mova m5, [t3+r10*4+400*8+0]
+ paddd m4, [t3+r10*4+400*4+8]
+ paddd m5, [t3+r10*4+400*8+8]
+ paddd m2, m4, [t3+r10*4+400*4+4]
+ paddd m3, m5, [t3+r10*4+400*8+4]
+ paddd m0, m1
+ pslld m1, 2
+ pslld m2, 2
+ paddd m1, m0 ; ab5 565
+ paddd m3, m3 ; ab3[ 0] 222
+ psubd m2, m4 ; ab3[-1] 343
+ mova [t3+r10*4+400*20], m3
+ pandn m0, m6, m1 ; a5 565
+ mova [t3+r10*4+400*24], m2
+ psrld m1, 12 ; b5 565
+ mova [t3+r10*4+400*12], m0
+ paddd m3, m3
+ mova [t3+r10*4+400*16], m1
+ psubd m3, m5 ; ab3[ 0] 343
+ mova [t3+r10*4+400*28], m3
+ add r10, 8
+ jl .prep_n_loop
+ ret
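+; In .prep_n/.n0/.n1, "565" is the 5,6,5 horizontal weighting applied to
+; the 5x5 filter's a/b values, built as 4*(l+c+r) + (l+c+r) + c, while
+; "343" and "222" are the 3,4,3 and 2,2,2 per-row patterns from which the
+; 3x3 filter's 343 + 444 + 343 neighborhood weighting is assembled across
+; adjacent rows (rough description of the scheme, not an exact derivation).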
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t3+r10*4+4]
+ paddd m4, m0, [t3+r10*4+0]
+ paddd m4, [t3+r10*4+8]
+ paddd m0, m4
+ pslld m4, 2
+ paddd m4, m0
+ pandn m0, m6, m4
+ psrld m4, 12
+ paddd m2, m0, [t3+r10*4+400*12] ; a5
+ mova [t3+r10*4+400*12], m0
+ paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
+ mova [t3+r10*4+400*16], m4
+ mova m3, [t3+r10*4+400*4+0]
+ paddd m3, [t3+r10*4+400*4+8]
+ paddd m5, m3, [t3+r10*4+400*4+4]
+ paddd m5, m5 ; ab3[ 1] 222
+ mova m4, [t3+r10*4+400*20]
+ paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
+ mova [t3+r10*4+400*20], m5
+ paddd m5, m5
+ psubd m5, m3 ; ab3[ 1] 343
+ mova [t3+r10*4+400*24], m5
+ paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m3, m6, m1
+ psrld m1, 12
+ pandn m5, m6, m4
+ psrld m4, 12
+ paddd m3, m5 ; a3
+ paddd m1, m4 ; b3 + (1 << 8)
+ pmovzxbd m4, [dstq+r10]
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ psubd m0, m6
+ psrad m0, 13
+ paddd m0, m4
+ vextracti128 xm1, m0, 1
+ packssdw xm0, xm1
+ packuswb xm0, xm0
+ movq [dstq+r10], xm0
+ add r10, 8
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m3, [t3+r10*4+400*8+0]
+ paddd m3, [t3+r10*4+400*8+8]
+ paddd m5, m3, [t3+r10*4+400*8+4]
+ paddd m5, m5 ; ab3[ 1] 222
+ mova m4, [t3+r10*4+400*20]
+ paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
+ mova [t3+r10*4+400*20], m5
+ paddd m5, m5
+ psubd m5, m3 ; ab3[ 1] 343
+ mova [t3+r10*4+400*28], m5
+ paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m3, m6, m1
+ psrld m1, 12
+ pandn m5, m6, m4
+ psrld m4, 12
+ paddd m3, m5 ; a3
+ paddd m1, m4 ; b3 + (1 << 8)
+ pmovzxbd m4, [dstq+r10]
+ pmaddwd m2, m4, [t3+r10*4+400*12] ; a5 * src
+ mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7)
+ pmaddwd m3, m4 ; a3 * src
+ psubd m0, m2 ; b5 - a5 * src + (1 << 7)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 8
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ psubd m0, m6
+ psrad m0, 13
+ paddd m0, m4
+ vextracti128 xm1, m0, 1
+ packssdw xm0, xm1
+ packuswb xm0, xm0
+ movq [dstq+r10], xm0
+ add r10, 8
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration_avx512.asm b/third_party/dav1d/src/x86/looprestoration_avx512.asm
new file mode 100644
index 0000000000..1e571774ca
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_avx512.asm
@@ -0,0 +1,2122 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+wiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12
+wiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13
+wiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
+wiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47
+ db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63
+sgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8
+ db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128
+sgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
+r_ext_mask: times 68 db -1
+ times 4 db 0
+wiener_x_shuf: db 0, 2, -1, 0
+wiener_x_add: db 0, 1,127, 0
+
+pw_61448: times 2 dw 61448
+pw_164_455: dw 164, 455
+pd_m16380: dd -16380
+pd_m4096: dd -4096
+pd_m25: dd -25
+pd_m9: dd -9
+pd_34816: dd 34816
+pd_8421376: dd 8421376
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
+
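+; 7-tap Wiener filter. Each .h/.h_top call horizontally filters one row of
+; pixels into a ring of temporary row buffers (t0-t6, see DECLARE_REG_TMP
+; above); .hv/.v then combine seven buffered rows vertically. As the filter
+; is symmetric, rows are paired around the center tap: (t6 + newest row)
+; and (t5 + t1) use y0/y1, (t4 + t2) uses y2, and t3 uses the y3 center tap.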
+INIT_ZMM avx512icl
+cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m6, [wiener_shufA]
+ vbroadcasti32x4 m7, [wiener_shufB]
+ mov r10d, 0xfffe
+ vbroadcasti32x4 m8, [wiener_shufC]
+ vbroadcasti32x4 m9, [wiener_shufD]
+ kmovw k1, r10d
+ vpbroadcastd m0, [wiener_x_shuf]
+ vpbroadcastd m1, [wiener_x_add]
+ mov r10, 0xaaaaaaaaaaaaaaaa
+ vpbroadcastd m11, [fltq+ 0]
+ vpbroadcastd m12, [fltq+ 4]
+ kmovq k2, r10
+ vpbroadcastd m10, [pd_m16380]
+ packsswb m11, m11 ; x0 x1 x0 x1
+ vpbroadcastd m14, [fltq+16]
+ pshufb m12, m0
+ vpbroadcastd m15, [fltq+20]
+ paddb m12, m1 ; x2 x3+1 x2 127
+ vpbroadcastd m13, [pd_8421376]
+ psllw m14, 5 ; y0 y1
+ psllw m15, 5 ; y2 y3
+ cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32
+ jle .w32 ; pixels, so we need a special case for small widths
+ lea t1, [rsp+wq*2+16]
+ add lpfq, wq
+ add dstq, wq
+ neg wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 m16{k1}, [lpfq+r10-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm16, [lpfq+r10] ; the masked load ensures that no exception
+ vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10-4]
+.h_main:
+ movu m17, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ push r0
+ lea r0, [r_ext_mask+65]
+ vpbroadcastb m0, [lpfq-1]
+ vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r0+r10+8], 0xe4
+ pop r0
+.h_have_right:
+ pshufb m4, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m4, m11
+ pshufb m4, m16, m7
+ mova m2, m10
+ vpdpbusd m2, m4, m11
+ pshufb m4, m17, m6
+ mova m1, m10
+ vpdpbusd m1, m4, m11
+ pshufb m4, m17, m7
+ mova m3, m10
+ vpdpbusd m3, m4, m11
+ pshufb m4, m16, m8
+ vpdpbusd m0, m4, m12
+ pshufb m16, m9
+ vpdpbusd m2, m16, m12
+ pshufb m4, m17, m8
+ vpdpbusd m1, m4, m12
+ pshufb m17, m9
+ vpdpbusd m3, m17, m12
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 3
+ psraw m1, 3
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+64], m1
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 m16{k1}, [lpfq+r10-4]
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastb xm16, [lpfq+r10]
+ vmovdqu32 m16{k1}, [lpfq+r10-4]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m16, [lpfq+r10-4]
+.hv_main:
+ movu m17, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -66
+ jl .hv_have_right
+ push r0
+ lea r0, [r_ext_mask+65]
+ vpbroadcastb m0, [lpfq-1]
+ vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r0+r10+8], 0xe4
+ pop r0
+.hv_have_right:
+ pshufb m4, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m4, m11
+ pshufb m4, m16, m7
+ mova m2, m10
+ vpdpbusd m2, m4, m11
+ pshufb m4, m17, m6
+ mova m1, m10
+ vpdpbusd m1, m4, m11
+ pshufb m4, m17, m7
+ mova m3, m10
+ vpdpbusd m3, m4, m11
+ pshufb m4, m16, m8
+ vpdpbusd m0, m4, m12
+ pshufb m16, m9
+ vpdpbusd m2, m16, m12
+ pshufb m4, m17, m8
+ vpdpbusd m1, m4, m12
+ pshufb m17, m9
+ vpdpbusd m3, m17, m12
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 3
+ psraw m1, 3
+ mova m16, [t4+r10*2]
+ paddw m16, [t2+r10*2]
+ mova m3, [t3+r10*2]
+ mova m17, [t4+r10*2+64]
+ paddw m17, [t2+r10*2+64]
+ mova m5, [t3+r10*2+64]
+ punpcklwd m4, m16, m3
+ mova m2, m13
+ vpdpwssd m2, m4, m15
+ punpcklwd m18, m17, m5
+ mova m4, m13
+ vpdpwssd m4, m18, m15
+ punpckhwd m16, m3
+ mova m3, m13
+ vpdpwssd m3, m16, m15
+ punpckhwd m17, m5
+ mova m5, m13
+ vpdpwssd m5, m17, m15
+ mova m17, [t5+r10*2]
+ paddw m17, [t1+r10*2]
+ paddw m16, m0, [t6+r10*2]
+ mova m19, [t5+r10*2+64]
+ paddw m19, [t1+r10*2+64]
+ paddw m18, m1, [t6+r10*2+64]
+ mova [t0+r10*2+ 0], m0
+ mova [t0+r10*2+64], m1
+ punpcklwd m0, m16, m17
+ vpdpwssd m2, m0, m14
+ punpcklwd m1, m18, m19
+ vpdpwssd m4, m1, m14
+ punpckhwd m16, m17
+ vpdpwssd m3, m16, m14
+ punpckhwd m18, m19
+ vpdpwssd m5, m18, m14
+ packuswb m2, m4
+ psrlw m2, 8
+ vpackuswb m2{k2}, m3, m5
+ movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
+ add r10, 64 ; function is used for chroma as well, and in some
+ jl .hv_loop ; esoteric edge cases chroma dst pointers may only
+ mov t6, t5 ; have a 32-byte alignment despite having a width
+ mov t5, t4 ; larger than 32, so use an unaligned store here.
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m4, [t4+r10*2+ 0]
+ paddw m4, [t2+r10*2+ 0]
+ mova m1, [t3+r10*2+ 0]
+ mova m5, [t4+r10*2+64]
+ paddw m5, [t2+r10*2+64]
+ mova m3, [t3+r10*2+64]
+ punpcklwd m6, m4, m1
+ mova m0, m13
+ vpdpwssd m0, m6, m15
+ punpcklwd m6, m5, m3
+ mova m2, m13
+ vpdpwssd m2, m6, m15
+ punpckhwd m4, m1
+ mova m1, m13
+ vpdpwssd m1, m4, m15
+ punpckhwd m5, m3
+ mova m3, m13
+ vpdpwssd m3, m5, m15
+ mova m5, [t1+r10*2+ 0]
+ paddw m4, m5, [t6+r10*2+ 0]
+ paddw m5, [t5+r10*2+ 0]
+ mova m7, [t1+r10*2+64]
+ paddw m6, m7, [t6+r10*2+64]
+ paddw m7, [t5+r10*2+64]
+ punpcklwd m8, m4, m5
+ vpdpwssd m0, m8, m14
+ punpcklwd m8, m6, m7
+ vpdpwssd m2, m8, m14
+ punpckhwd m4, m5
+ vpdpwssd m1, m4, m14
+ punpckhwd m6, m7
+ vpdpwssd m3, m6, m14
+ packuswb m0, m2
+ psrlw m0, 8
+ vpackuswb m0{k2}, m1, m3
+ movu [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+.w32:
+ lea r10, [r_ext_mask+73]
+ mova ym18, [wiener_perm32]
+ lea t1, [rsp+16]
+ sub r10, wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .w32_no_top
+ call .w32_h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 32*2
+ call .w32_h_top
+ lea r9, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 32*2
+ add r9, strideq
+ mov [rsp], r9 ; below
+ call .w32_h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .w32_v1
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ mov t2, t1
+ dec hd
+ jz .w32_v2
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ dec hd
+ jz .w32_v3
+.w32_main:
+ lea t0, [t1+32*2]
+.w32_main_loop:
+ call .w32_hv
+ dec hd
+ jnz .w32_main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .w32_v3
+ mov lpfq, [rsp]
+ call .w32_hv_bottom
+ add lpfq, strideq
+ call .w32_hv_bottom
+.w32_v1:
+ call .w32_v
+ RET
+.w32_no_top:
+ lea r9, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r9, [r9+strideq*2]
+ mov [rsp], r9
+ call .w32_h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .w32_v1
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ mov t2, t1
+ dec hd
+ jz .w32_v2
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ dec hd
+ jz .w32_v3
+ lea t0, [t1+32*2]
+ call .w32_hv
+ dec hd
+ jz .w32_v3
+ add t0, 32*8
+ call .w32_hv
+ dec hd
+ jnz .w32_main
+.w32_v3:
+ call .w32_v
+.w32_v2:
+ call .w32_v
+ jmp .w32_v1
+.w32_h:
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_h_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 ym16{k1}, [lpfq-4]
+ add leftq, 4
+ jmp .w32_h_main
+.w32_h_extend_left:
+ vpbroadcastb xm16, [lpfq] ; the masked load ensures that no exception
+ vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory
+ jmp .w32_h_main
+.w32_h_top:
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_h_extend_left
+ movu ym16, [lpfq-4]
+.w32_h_main:
+ vinserti32x8 m16, [lpfq+4], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .w32_h_have_right
+ vpbroadcastb m0, [lpfq+wq-1]
+ movu ym17, [r10-8]
+ vinserti32x8 m17, [r10+0], 1
+ vpternlogd m16, m0, m17, 0xe4 ; c ? a : b
+.w32_h_have_right:
+ pshufb m2, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m2, m11
+ pshufb m2, m16, m7
+ mova m1, m10
+ vpdpbusd m1, m2, m11
+ pshufb m2, m16, m8
+ vpdpbusd m0, m2, m12
+ pshufb m16, m9
+ vpdpbusd m1, m16, m12
+ packssdw m0, m1
+ psraw m0, 3
+ mova [t1], m0
+ ret
+.w32_hv:
+ add lpfq, strideq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_hv_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 ym16{k1}, [lpfq-4]
+ add leftq, 4
+ jmp .w32_hv_main
+.w32_hv_extend_left:
+ vpbroadcastb xm16, [lpfq]
+ vmovdqu32 ym16{k1}, [lpfq-4]
+ jmp .w32_hv_main
+.w32_hv_bottom:
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_hv_extend_left
+ movu ym16, [lpfq-4]
+.w32_hv_main:
+ vinserti32x8 m16, [lpfq+4], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .w32_hv_have_right
+ vpbroadcastb m0, [lpfq+wq-1]
+ movu ym17, [r10-8]
+ vinserti32x8 m17, [r10+0], 1
+ vpternlogd m16, m0, m17, 0xe4
+.w32_hv_have_right:
+ mova m3, [t4]
+ paddw m3, [t2]
+ mova m2, [t3]
+ pshufb m4, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m4, m11
+ pshufb m4, m16, m7
+ mova m5, m10
+ vpdpbusd m5, m4, m11
+ punpcklwd m4, m3, m2
+ mova m1, m13
+ vpdpwssd m1, m4, m15
+ punpckhwd m3, m2
+ mova m2, m13
+ vpdpwssd m2, m3, m15
+ pshufb m4, m16, m8
+ vpdpbusd m0, m4, m12
+ pshufb m16, m9
+ vpdpbusd m5, m16, m12
+ packssdw m0, m5
+ psraw m0, 3
+ mova m4, [t5]
+ paddw m4, [t1]
+ paddw m3, m0, [t6]
+ mova [t0], m0
+ punpcklwd m0, m3, m4
+ vpdpwssd m1, m0, m14
+ punpckhwd m3, m4
+ vpdpwssd m2, m3, m14
+ packuswb m1, m2
+ vpermb m16, m18, m1
+ mova [dstq], ym16
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.w32_v:
+ mova m2, [t4]
+ paddw m2, [t2]
+ mova m1, [t3]
+ mova m4, [t1]
+ paddw m3, m4, [t6]
+ paddw m4, [t5]
+ punpcklwd m5, m2, m1
+ mova m0, m13
+ vpdpwssd m0, m5, m15
+ punpckhwd m2, m1
+ mova m1, m13
+ vpdpwssd m1, m2, m15
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ packuswb m0, m1
+ vpermb m16, m18, m0
+ mova [dstq], ym16
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
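+; Self-guided restoration, 5x5 box filter (n = 25). As a rough scalar
+; sketch, following the inline comments below rather than restating the
+; algorithm authoritatively: per pixel,
+;   p = sumsq * 25 - sum * sum         (box sum and box sum of squares)
+;   z = (p * s0 + rounding) >> 20, clamped to [0, 255]
+;   x = sgr_x_by_x[z]
+;   a = x, b = x * sum * 164 plus rounding
+; a and b are packed into one dword as a | (b << 12), blended with the
+; neighboring positions' values using 5,6,5 ("565") weights, and the output
+; adds the w0-weighted correction (b - a * src) back onto the source pixel.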
+cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \
+ w, h, edge, params
+ mov paramsq, r6mp
+ mov wd, wm
+ mov hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m5, [sgr_shuf+1]
+ add lpfq, wq
+ vbroadcasti32x4 m6, [sgr_shuf+9]
+ add dstq, wq
+ vbroadcasti32x4 m7, [sgr_shuf+3]
+ lea t3, [rsp+wq*4+16+416*12]
+ vbroadcasti32x4 m8, [sgr_shuf+7]
+ pxor m4, m4
+ vpbroadcastd m9, [pd_m25]
+ vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
+ vpbroadcastw m15, [paramsq+8] ; w0
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m10, [pw_164_455]
+ neg wq
+ vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
+ mov r10d, 0xfe
+ vpbroadcastd m13, [pd_m4096]
+ kmovb k1, r10d
+ vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
+ mov r10, 0x3333333333333333
+ mova m18, [sgr_x_by_x+64*0]
+ kmovq k2, r10
+ mova m19, [sgr_x_by_x+64*1]
+ lea r12, [r_ext_mask+75]
+ mova m20, [sgr_x_by_x+64*2]
+ psllw m15, 4
+ mova m21, [sgr_x_by_x+64*3]
+ lea r10, [lpfq+strideq*4]
+ mova ym22, [sgr_shuf]
+ add r10, strideq
+ mov [rsp], r10 ; below
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+416*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu ym17, [lpfq+r10-2]
+.h_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.h_have_right:
+ pshufb m3, m17, m5
+ pmullw m2, m3, m3
+ pshufb m1, m17, m6
+ paddw m0, m3, m1
+ shufps m3, m1, q2121
+ paddw m0, m3
+ punpcklwd m16, m3, m1
+ punpckhwd m3, m1
+ punpcklwd m1, m2, m4
+ vpdpwssd m1, m16, m16
+ punpckhwd m2, m4
+ vpdpwssd m2, m3, m3
+ pshufb m16, m17, m7
+ paddw m0, m16
+ pshufb m17, m8
+ paddw m0, m17 ; sum
+ punpcklwd m3, m16, m17
+ vpdpwssd m1, m3, m3 ; sumsq
+ punpckhwd m16, m17
+ vpdpwssd m2, m16, m16
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10*2+416*0]
+ paddd m1, [t1+r10*2+416*2]
+ paddd m2, [t1+r10*2+416*4]
+.h_loop_end:
+ mova [t1+r10*2+416*0], m0
+ mova [t1+r10*2+416*2], m1
+ mova [t1+r10*2+416*4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-2]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10*2+416*0]
+ mova m1, [t1+r10*2+416*2]
+ mova m2, [t1+r10*2+416*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10*2+416*0], m0
+ mova [t2+r10*2+416*2], m1
+ mova [t2+r10*2+416*4], m2
+ add r10, 32
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu ym17, [lpfq+r10-2]
+.hv_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv_have_right:
+ pshufb m1, m17, m5
+ pmullw m3, m1, m1
+ pshufb m2, m17, m6
+ paddw m0, m1, m2
+ shufps m1, m2, q2121
+ paddw m0, m1
+ punpcklwd m16, m1, m2
+ punpckhwd m1, m2
+ punpcklwd m2, m3, m4
+ vpdpwssd m2, m16, m16
+ punpckhwd m3, m4
+ vpdpwssd m3, m1, m1
+ pshufb m16, m17, m7
+ paddw m0, m16
+ pshufb m17, m8
+ paddw m0, m17 ; h sum
+ punpcklwd m1, m16, m17
+ vpdpwssd m2, m1, m1 ; h sumsq
+ punpckhwd m16, m17
+ vpdpwssd m3, m16, m16
+ paddw m1, m0, [t1+r10*2+416*0]
+ paddd m16, m2, [t1+r10*2+416*2]
+ paddd m17, m3, [t1+r10*2+416*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddd m16, [t2+r10*2+416*2] ; hv sumsq
+ paddd m17, [t2+r10*2+416*4]
+ paddw m1, [t2+r10*2+416*0] ; hv sum
+ mova [t0+r10*2+416*2], m2
+ mova [t0+r10*2+416*4], m3
+ mova [t0+r10*2+416*0], m0
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ punpcklwd m0, m1, m4 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16 ; The neighbor calculations require
+ mova [t3+r10*4+ 24], xm17 ; 13 bits for a and 21 bits for b.
+ vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for only 12+20, but
+ mova [t3+r10*4+ 72], m17 ; that gets us most of the way.
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10*2+416*0], m1
+ paddw m1, m0
+ mova [t1+r10*2+416*2], m16
+ paddd m16, m2
+ mova [t1+r10*2+416*4], m17
+ paddd m17, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m2, [t1+r10*2+416*2]
+ paddd m16, m2, [t2+r10*2+416*2]
+ mova m3, [t1+r10*2+416*4]
+ paddd m17, m3, [t2+r10*2+416*4]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m16, m2 ; hv sumsq
+ paddd m17, m3
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ mova m0, [t1+r10*2+416*0]
+ paddw m1, m0, [t2+r10*2+416*0]
+ paddw m0, m0
+ paddw m1, m0 ; hv sum
+ punpcklwd m0, m1, m4 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16
+ mova [t3+r10*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+ 56], m17, 2
+ mova [t3+r10*4+ 72], m17
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+ 4]
+ movu m1, [t3+r10*4+68]
+ paddd m2, m0, [t3+r10*4+ 0]
+ paddd m3, m1, [t3+r10*4+64]
+ paddd m2, [t3+r10*4+ 8]
+ paddd m3, [t3+r10*4+72]
+ paddd m0, m2
+ pslld m2, 2
+ paddd m1, m3
+ pslld m3, 2
+ paddd m2, m0 ; ab 565
+ paddd m3, m1
+ pandn m0, m13, m2 ; a
+ psrld m2, 12 ; b
+ pandn m1, m13, m3
+ psrld m3, 12
+ mova [t3+r10*4+416*4+ 0], m0
+ mova [t3+r10*4+416*8+ 0], m2
+ mova [t3+r10*4+416*4+64], m1
+ mova [t3+r10*4+416*8+64], m3
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m16, [t3+r10*4+ 4]
+ movu m17, [t3+r10*4+68]
+ paddd m0, m16, [t3+r10*4+ 0]
+ paddd m1, m17, [t3+r10*4+64]
+ paddd m0, [t3+r10*4+ 8]
+ paddd m1, [t3+r10*4+72]
+ paddd m16, m0
+ pslld m0, 2
+ paddd m17, m1
+ pslld m1, 2
+ paddd m0, m16
+ paddd m1, m17
+ pandn m16, m13, m0
+ psrld m0, 12
+ pandn m17, m13, m1
+ psrld m1, 12
+ paddd m2, m16, [t3+r10*4+416*4+ 0] ; a
+ paddd m3, m17, [t3+r10*4+416*4+64]
+ mova [t3+r10*4+416*4+ 0], m16
+ mova [t3+r10*4+416*4+64], m17
+ paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8)
+ paddd m17, m1, [t3+r10*4+416*8+64]
+ mova [t3+r10*4+416*8+ 0], m0
+ mova [t3+r10*4+416*8+64], m1
+ pmovzxbd m0, [dstq+r10+ 0]
+ pmovzxbd m1, [dstq+r10+16]
+ pmaddwd m2, m0 ; a * src
+ pmaddwd m3, m1
+ packssdw m0, m1
+ psubd m16, m2 ; b - a * src + (1 << 8)
+ psubd m17, m3
+ psrad m16, 9
+ psrad m17, 9
+ packssdw m16, m17
+ pmulhrsw m16, m15
+ paddw m16, m0
+ packuswb m16, m16
+ vpermd m16, m22, m16
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ pmovzxbd m0, [dstq+r10+ 0]
+ pmovzxbd m1, [dstq+r10+16]
+ pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src
+ pmaddwd m3, m1, [t3+r10*4+416*4+64]
+ mova m16, [t3+r10*4+416*8+ 0] ; b + (1 << 7)
+ mova m17, [t3+r10*4+416*8+64]
+ packssdw m0, m1
+ psubd m16, m2 ; b - a * src + (1 << 7)
+ psubd m17, m3
+ psrad m16, 8
+ psrad m17, 8
+ packssdw m16, m17
+ pmulhrsw m16, m15
+ paddw m16, m0
+ packuswb m16, m16
+ vpermd m16, m22, m16
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
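+; Self-guided restoration, 3x3 box filter. Same overall structure as the
+; 5x5 filter above, but with n = 9 (pd_m9 instead of pd_m25), b scaled by
+; 455 instead of 164, the s1/w1 parameters, and a/b produced on every row
+; instead of every other row.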
+cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
+ w, h, edge, params
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m5, [sgr_shuf+3]
+ add lpfq, wq
+ vbroadcasti32x4 m6, [sgr_shuf+5]
+ add dstq, wq
+ vbroadcasti32x4 m7, [sgr_shuf+7]
+ pxor m4, m4
+ vpbroadcastd m8, [pd_m9]
+ vpsubd m11, m4, [paramsq+4] {1to16} ; -s1
+ vpbroadcastw m15, [paramsq+10] ; w1
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m10, [pw_164_455]
+ lea t3, [rsp+wq*4+16+416*12]
+ vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
+ neg wq
+ vpbroadcastd m13, [pd_m4096]
+ mov r10d, 0xfe
+ vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
+ kmovb k1, r10d
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10, 0x3333333333333333
+ mova m19, [sgr_x_by_x+64*1]
+ kmovq k2, r10
+ mova m20, [sgr_x_by_x+64*2]
+ psllw m15, 4
+ mova m21, [sgr_x_by_x+64*3]
+ lea r14, [r_ext_mask+75]
+ mova ym9, [sgr_shuf]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 416*6
+ call .h_top
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add t4, strideq
+ mov [rsp], t4 ; below
+ mov t0, t2
+ call .hv
+.main:
+ mov t5, t3
+ add t3, 416*4
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ dec hd
+ jz .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv
+ call .n
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv_bottom
+ call .n
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n
+ RET
+.height1:
+ call .v
+ call .prep_n
+ mov t2, t1
+ call .v
+ jmp .end
+.extend_bottom:
+ call .v
+ call .n
+ mov t2, t1
+ call .v
+ jmp .end
+.no_top:
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea t4, [t4+strideq*2]
+ mov [rsp], t4
+ call .h
+ lea t0, [t1+416*6]
+ mov t2, t1
+ call .v
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu ym17, [lpfq+r10-2]
+.h_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r14+r10-8]
+ vinserti32x8 m16, [r14+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.h_have_right:
+ pshufb m0, m17, m5
+ pmullw m2, m0, m0
+ pshufb m16, m17, m6
+ paddw m0, m16
+ pshufb m17, m7
+ paddw m0, m17 ; sum
+ punpcklwd m3, m16, m17
+ punpcklwd m1, m2, m4
+ vpdpwssd m1, m3, m3 ; sumsq
+ punpckhwd m16, m17
+ punpckhwd m2, m4
+ vpdpwssd m2, m16, m16
+ mova [t1+r10*2+416*0], m0
+ mova [t1+r10*2+416*2], m1
+ mova [t1+r10*2+416*4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu ym17, [lpfq+r10-2]
+.hv_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r14+r10-8]
+ vinserti32x8 m16, [r14+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv_have_right:
+ pshufb m0, m17, m5
+ pmullw m3, m0, m0
+ pshufb m1, m17, m6
+ paddw m0, m1
+ pshufb m17, m7
+ paddw m0, m17 ; h sum
+ punpcklwd m16, m17, m1
+ punpcklwd m2, m3, m4
+ vpdpwssd m2, m16, m16 ; h sumsq
+ punpckhwd m17, m1
+ punpckhwd m3, m4
+ vpdpwssd m3, m17, m17
+ paddw m1, m0, [t2+r10*2+416*0]
+ paddw m1, [t1+r10*2+416*0] ; hv sum
+ paddd m16, m2, [t2+r10*2+416*2]
+ paddd m17, m3, [t2+r10*2+416*4]
+ paddd m16, [t1+r10*2+416*2] ; hv sumsq
+ paddd m17, [t1+r10*2+416*4]
+ mova [t0+r10*2+416*0], m0
+ mova [t0+r10*2+416*2], m2
+ mova [t0+r10*2+416*4], m3
+ pmulld m16, m8 ; -a * 9
+ pmulld m17, m8
+ punpcklwd m0, m4, m1 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16
+ mova [t3+r10*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+ 56], m17, 2
+ mova [t3+r10*4+ 72], m17
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m16, [t1+r10*2+416*2]
+ mova m17, [t1+r10*2+416*4]
+ paddd m16, m16
+ paddd m17, m17
+ paddd m16, [t2+r10*2+416*2] ; hv sumsq
+ paddd m17, [t2+r10*2+416*4]
+ pmulld m16, m8 ; -a * 9
+ pmulld m17, m8
+ mova m1, [t1+r10*2+416*0]
+ paddw m1, m1
+ paddw m1, [t2+r10*2+416*0] ; hv sum
+ punpcklwd m0, m4, m1 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16
+ mova [t3+r10*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+ 56], m17, 2
+ mova [t3+r10*4+ 72], m17
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+ mov t4, t3
+ add t3, 416*4
+.prep_n_loop:
+ mova m2, [t5+r10*4+0]
+ mova m3, [t4+r10*4+0]
+ paddd m2, [t5+r10*4+8]
+ paddd m3, [t4+r10*4+8]
+ paddd m0, m2, [t5+r10*4+4]
+ paddd m1, m3, [t4+r10*4+4]
+ pslld m0, 2
+ paddd m1, m1 ; ab[ 0] 222
+ psubd m0, m2 ; ab[-1] 343
+ mova [t3+r10*4+416*4], m1
+ paddd m1, m1
+ mova [t5+r10*4], m0
+ psubd m1, m3 ; ab[ 0] 343
+ mova [t4+r10*4], m1
+ add r10, 16
+ jl .prep_n_loop
+ ret
+; a+b are packed together in a single dword, but we can't do the
+; full neighbor calculations before splitting them since we don't
+; have sufficient precision. The solution is to do the calculations
+; in two equal halves and split a and b before doing the final sum.
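+; Concretely (roughly): the 3x3 neighbor weighting is 3,4,3 for the row
+; above, 4,4,4 for the current row and 3,4,3 for the row below. Each of the
+; two halves computed in .n below contains one "222" term plus one "343"
+; term, so summing the halves after unpacking a and b yields the full
+; 343 + 444 + 343 weighting without overflowing the 12-bit/20-bit fields.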
+ALIGN function_align
+.n: ; neighbor + output
+ mov r10, wq
+.n_loop:
+ mova m16, [t3+r10*4+ 0]
+ paddd m16, [t3+r10*4+ 8]
+ paddd m17, m16, [t3+r10*4+ 4]
+ paddd m17, m17 ; ab[+1] 222
+ mova m2, [t3+r10*4+416*4+ 0]
+ paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
+ mova m3, [t3+r10*4+416*4+64]
+ paddd m1, m3, [t5+r10*4+64]
+ mova [t3+r10*4+416*4+ 0], m17
+ paddd m17, m17
+ psubd m17, m16 ; ab[+1] 343
+ mova [t5+r10*4+ 0], m17
+ paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343
+ mova m16, [t3+r10*4+64]
+ paddd m16, [t3+r10*4+72]
+ paddd m17, m16, [t3+r10*4+68]
+ paddd m17, m17
+ mova [t3+r10*4+416*4+64], m17
+ paddd m17, m17
+ psubd m17, m16
+ mova [t5+r10*4+64], m17
+ pandn m16, m13, m0
+ psrld m0, 12
+ paddd m3, m17
+ pandn m17, m13, m2
+ psrld m2, 12
+ paddd m16, m17 ; a
+ pandn m17, m13, m1
+ psrld m1, 12
+ paddd m0, m2 ; b + (1 << 8)
+ pandn m2, m13, m3
+ psrld m3, 12
+ paddd m17, m2
+ pmovzxbd m2, [dstq+r10+ 0]
+ paddd m1, m3
+ pmovzxbd m3, [dstq+r10+16]
+ pmaddwd m16, m2 ; a * src
+ pmaddwd m17, m3
+ packssdw m2, m3
+ psubd m0, m16 ; b - a * src + (1 << 8)
+ psubd m1, m17
+ psrad m0, 9
+ psrad m1, 9
+ packssdw m0, m1
+ pmulhrsw m0, m15
+ paddw m0, m2
+ packuswb m0, m0
+ vpermd m16, m9, m0
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n_loop
+ mov r10, t5
+ mov t5, t4
+ mov t4, r10
+ add dstq, strideq
+ ret
+
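+; "Mix" self-guided filter: both box filters run in a single pass (the 5x5
+; filter contributing on every other row, the 3x3 filter on every row) and
+; the two corrections are blended with the w0/w1 weights from the parameter
+; block; largely a combination of the two functions above.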
+cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m5, [sgr_shuf+1]
+ add lpfq, wq
+ vbroadcasti32x4 m6, [sgr_shuf+9]
+ add dstq, wq
+ vbroadcasti32x4 m7, [sgr_shuf+3]
+ lea t3, [rsp+wq*4+416*24+8]
+ vbroadcasti32x4 m8, [sgr_shuf+7]
+ pxor m4, m4
+ vpbroadcastd m9, [pd_m9]
+ vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
+ vpbroadcastd m14, [pw_61448]
+ vpsubd m12, m4, [paramsq+4] {1to16} ; -s1
+ vpbroadcastd m26, [paramsq+8] ; w0 w1
+ lea t1, [rsp+wq*2+12]
+ vpbroadcastd m10, [pd_m25]
+ neg wq
+ vpbroadcastd m13, [pw_164_455]
+ mov r10d, 0xfe
+ vpbroadcastd m15, [pd_34816]
+ kmovb k1, r10d
+ mova m20, [sgr_x_by_x+64*0]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*1]
+ kmovq k2, r10
+ mova m22, [sgr_x_by_x+64*2]
+ lea r12, [r_ext_mask+75]
+ mova m23, [sgr_x_by_x+64*3]
+ vpbroadcastd m24, [pd_m4096]
+ vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____
+ psllw m26, 5
+ mova xm27, [sgr_mix_perm]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
+ add t1, 416*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+416*12]
+ lea r10, [wq-2]
+.top_fixup_loop:
+ mova m0, [t1+r10*2+416* 0]
+ mova m1, [t1+r10*2+416* 2]
+ mova m2, [t1+r10*2+416* 4]
+ paddw m0, m0
+ mova m3, [t1+r10*2+416* 6]
+ paddd m1, m1
+ mova m16, [t1+r10*2+416* 8]
+ paddd m2, m2
+ mova m17, [t1+r10*2+416*10]
+ mova [t2+r10*2+416* 0], m0
+ mova [t2+r10*2+416* 2], m1
+ mova [t2+r10*2+416* 4], m2
+ mova [t2+r10*2+416* 6], m3
+ mova [t2+r10*2+416* 8], m16
+ mova [t2+r10*2+416*10], m17
+ add r10, 32
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsums
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu ym17, [lpfq+r10-2]
+.h_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.h_have_right:
+ pshufb m3, m17, m5
+ pshufb m18, m17, m6
+ shufps m0, m3, m18, q2121
+ pmullw m2, m0, m0
+ pshufb m19, m17, m7
+ paddw m0, m19
+ pshufb m17, m8
+ paddw m0, m17 ; sum3
+ punpcklwd m16, m19, m17
+ punpcklwd m1, m2, m4
+ vpdpwssd m1, m16, m16 ; sumsq3
+ punpckhwd m19, m17
+ punpckhwd m2, m4
+ vpdpwssd m2, m19, m19
+ mova [t1+r10*2+416* 6], m0
+ mova [t1+r10*2+416* 8], m1
+ mova [t1+r10*2+416*10], m2
+ punpcklwd m19, m3, m18
+ paddw m0, m3
+ vpdpwssd m1, m19, m19 ; sumsq5
+ punpckhwd m3, m18
+ paddw m0, m18 ; sum5
+ vpdpwssd m2, m3, m3
+ mova [t1+r10*2+416* 0], m0
+ mova [t1+r10*2+416* 2], m1
+ mova [t1+r10*2+416* 4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu ym17, [lpfq+r10-2]
+.hv0_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -34
+ jl .hv0_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv0_have_right:
+ pshufb m18, m17, m5
+ pshufb m19, m17, m6
+ shufps m1, m18, m19, q2121
+ pmullw m3, m1, m1
+ pshufb m0, m17, m7
+ paddw m1, m0
+ pshufb m17, m8
+ paddw m1, m17 ; sum3
+ punpcklwd m16, m0, m17
+ punpcklwd m2, m3, m4
+ vpdpwssd m2, m16, m16 ; sumsq3
+ punpckhwd m0, m17
+ punpckhwd m3, m4
+ vpdpwssd m3, m0, m0
+ paddw m0, m1, [t1+r10*2+416* 6]
+ paddd m16, m2, [t1+r10*2+416* 8]
+ paddd m17, m3, [t1+r10*2+416*10]
+ mova [t1+r10*2+416* 6], m1
+ mova [t1+r10*2+416* 8], m2
+ mova [t1+r10*2+416*10], m3
+ paddw m1, m18
+ paddw m1, m19 ; sum5
+ mova [t3+r10*4+416*8+ 8], m1
+ paddw m1, [t1+r10*2+416* 0]
+ mova [t1+r10*2+416* 0], m1
+ punpcklwd m1, m18, m19
+ vpdpwssd m2, m1, m1 ; sumsq5
+ punpckhwd m18, m19
+ vpdpwssd m3, m18, m18
+ mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of the last row
+ mova [t3+r10*4+416*0+72], m3 ; in case height is odd
+ paddd m2, [t1+r10*2+416* 2]
+ paddd m3, [t1+r10*2+416* 4]
+ mova [t1+r10*2+416* 2], m2
+ mova [t1+r10*2+416* 4], m3
+ paddw m1, m0, [t2+r10*2+416* 6]
+ paddd m2, m16, [t2+r10*2+416* 8]
+ paddd m3, m17, [t2+r10*2+416*10]
+ mova [t2+r10*2+416* 6], m0
+ mova [t2+r10*2+416* 8], m16
+ mova [t2+r10*2+416*10], m17
+ pmulld m16, m2, m9 ; -a3 * 9
+ pmulld m17, m3, m9
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m22
+ paddusw m17, m14
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*4+ 8], m16
+ mova [t3+r10*4+416*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
+ mova [t3+r10*4+416*4+ 72], m17
+ vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*4+104], m16, 3
+ add r10, 32
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu ym17, [lpfq+r10-2]
+.hv1_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -34
+ jl .hv1_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv1_have_right:
+ pshufb m3, m17, m5
+ pshufb m19, m17, m6
+ shufps m2, m3, m19, q2121
+ pmullw m1, m2, m2
+ pshufb m18, m17, m7
+ paddw m2, m18
+ pshufb m17, m8
+ paddw m2, m17 ; sum3
+ punpcklwd m16, m17, m18
+ punpcklwd m0, m1, m4
+ vpdpwssd m0, m16, m16 ; sumsq3
+ punpckhwd m17, m18
+ punpckhwd m1, m4
+ vpdpwssd m1, m17, m17
+ paddd m16, m0, [t2+r10*2+416* 8]
+ paddd m17, m1, [t2+r10*2+416*10]
+ mova [t2+r10*2+416* 8], m0
+ mova [t2+r10*2+416*10], m1
+ punpcklwd m18, m3, m19
+ vpdpwssd m0, m18, m18 ; sumsq5
+ punpckhwd m18, m3, m19
+ vpdpwssd m1, m18, m18
+ paddw m3, m19
+ pmulld m16, m9 ; -a3 * 9
+ pmulld m17, m9
+ paddd m18, m0, [t2+r10*2+416*2]
+ paddd m19, m1, [t2+r10*2+416*4]
+ paddd m18, [t1+r10*2+416*2]
+ paddd m19, [t1+r10*2+416*4]
+ mova [t2+r10*2+416*2], m0
+ mova [t2+r10*2+416*4], m1
+ pmulld m18, m10 ; -a5 * 25
+ pmulld m19, m10
+ paddw m1, m2, [t2+r10*2+416* 6]
+ mova [t2+r10*2+416* 6], m2
+ paddw m2, m3 ; sum5
+ paddw m3, m2, [t2+r10*2+416*0]
+ paddw m3, [t1+r10*2+416*0]
+ mova [t2+r10*2+416*0], m2
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ punpcklwd m2, m3, m4 ; b5
+ vpdpwssd m18, m2, m2 ; -p5
+ punpckhwd m3, m4
+ vpdpwssd m19, m3, m3
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmulld m18, m11 ; p5 * s0
+ pmulld m19, m11
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ pmaddwd m2, m13 ; b5 * 164
+ pmaddwd m3, m13
+ vpalignr m17{k2}, m16, m16, 2
+ vpalignr m19{k2}, m18, m18, 2
+ paddusw m17, m14
+ mova m16, m22
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ paddusw m19, m14
+ mova m18, m22
+ psraw m19, 4 ; min(z5, 255) - 256
+ vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k4, m19
+ vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ vmovdqu8 m19{k4}, m18 ; x5
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ pandn m18, m24, m19
+ psrld m19, 16
+ pmulld m2, m18
+ pmulld m3, m19
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*8+ 8], m16
+ mova [t3+r10*4+416*8+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
+ paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m3, m15
+ mova [t3+r10*4+416*8+ 72], m17
+ vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*8+104], m16, 3
+ vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
+ vpternlogd m19, m3, m24, 0xd8
+ mova [t3+r10*4+416*0+ 8], m18
+ mova [t3+r10*4+416*0+ 24], xm19
+ vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
+ mova [t3+r10*4+416*0+ 72], m19
+ vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
+ vextracti32x4 [t3+r10*4+416*0+104], m18, 3
+ add r10, 32
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-2]
+.v0_loop:
+ mova m2, [t1+r10*2+416* 8]
+ mova m3, [t1+r10*2+416*10]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m16, m2, [t2+r10*2+416* 8]
+ paddd m17, m3, [t2+r10*2+416*10]
+ mova m0, [t1+r10*2+416* 6]
+ paddw m0, m0
+ paddw m1, m0, [t2+r10*2+416* 6]
+ pmulld m16, m9 ; -a3 * 9
+ pmulld m17, m9
+ mova [t2+r10*2+416* 6], m0
+ mova [t2+r10*2+416* 8], m2
+ mova [t2+r10*2+416*10], m3
+ mova m2, [t1+r10*2+416*0]
+ mova m3, [t1+r10*2+416*2]
+ mova m18, [t1+r10*2+416*4]
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ mova [t3+r10*4+416*8+ 8], m2
+ mova [t3+r10*4+416*0+ 8], m3
+ mova [t3+r10*4+416*0+72], m18
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m22
+ paddusw m17, m14
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddw m2, m2 ; cc5
+ paddd m3, m3
+ paddd m18, m18
+ mova [t1+r10*2+416*0], m2
+ mova [t1+r10*2+416*2], m3
+ mova [t1+r10*2+416*4], m18
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*4+ 8], m16
+ mova [t3+r10*4+416*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
+ mova [t3+r10*4+416*4+ 72], m17
+ vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*4+104], m16, 3
+ add r10, 32
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+.v1_loop:
+ mova m0, [t1+r10*2+416* 8]
+ paddd m16, m0, [t2+r10*2+416* 8]
+ mova m1, [t1+r10*2+416*10]
+ paddd m17, m1, [t2+r10*2+416*10]
+ mova m2, [t3+r10*4+416*0+ 8]
+ paddd m18, m2, [t2+r10*2+416* 2]
+ mova m3, [t3+r10*4+416*0+72]
+ paddd m19, m3, [t2+r10*2+416* 4]
+ paddd m18, [t1+r10*2+416* 2]
+ paddd m19, [t1+r10*2+416* 4]
+ mova [t2+r10*2+416* 8], m0
+ mova [t2+r10*2+416*10], m1
+ mova [t2+r10*2+416* 2], m2
+ mova [t2+r10*2+416* 4], m3
+ pmulld m16, m9 ; -a3 * 9
+ pmulld m17, m9
+ pmulld m18, m10 ; -a5 * 25
+ pmulld m19, m10
+ mova m0, [t1+r10*2+416* 6]
+ paddw m1, m0, [t2+r10*2+416* 6]
+ mova m2, [t3+r10*4+416*8+ 8]
+ paddw m3, m2, [t2+r10*2+416*0]
+ paddw m3, [t1+r10*2+416*0]
+ mova [t2+r10*2+416* 6], m0
+ mova [t2+r10*2+416*0], m2
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ punpcklwd m2, m3, m4 ; b5
+ vpdpwssd m18, m2, m2 ; -p5
+ punpckhwd m3, m4
+ vpdpwssd m19, m3, m3
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmulld m18, m11 ; p5 * s0
+ pmulld m19, m11
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ pmaddwd m2, m13 ; b5 * 164
+ pmaddwd m3, m13
+ vpalignr m17{k2}, m16, m16, 2
+ vpalignr m19{k2}, m18, m18, 2
+ paddusw m17, m14
+ mova m16, m22
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ paddusw m19, m14
+ mova m18, m22
+ psraw m19, 4 ; min(z5, 255) - 256
+ vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k4, m19
+ vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ vmovdqu8 m19{k4}, m18 ; x5
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ pandn m18, m24, m19
+ psrld m19, 16
+ pmulld m2, m18
+ pmulld m3, m19
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*8+ 8], m16
+ mova [t3+r10*4+416*8+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
+ paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m3, m15
+ mova [t3+r10*4+416*8+ 72], m17
+ vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*8+104], m16, 3
+ vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
+ vpternlogd m19, m3, m24, 0xd8
+ mova [t3+r10*4+416*0+ 8], m18
+ mova [t3+r10*4+416*0+ 24], xm19
+ vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
+ mova [t3+r10*4+416*0+ 72], m19
+ vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
+ vextracti32x4 [t3+r10*4+416*0+104], m18, 3
+ add r10, 32
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+416*0+4]
+ paddd m1, m0, [t3+r10*4+416*0+0]
+ mova m16, [t3+r10*4+416*4+0]
+ paddd m1, [t3+r10*4+416*0+8]
+ mova m17, [t3+r10*4+416*8+0]
+ paddd m16, [t3+r10*4+416*4+8]
+ paddd m17, [t3+r10*4+416*8+8]
+ paddd m2, m16, [t3+r10*4+416*4+4]
+ paddd m3, m17, [t3+r10*4+416*8+4]
+ paddd m0, m1
+ pslld m1, 2
+ pslld m2, 2
+ paddd m1, m0 ; ab5 565
+ paddd m3, m3 ; ab3[ 0] 222
+ psubd m2, m16 ; ab3[-1] 343
+ mova [t3+r10*4+416*20], m3
+ pandn m0, m24, m1 ; a5 565
+ mova [t3+r10*4+416*24], m2
+ psrld m1, 12 ; b5 565
+ mova [t3+r10*4+416*12], m0
+ paddd m3, m3
+ mova [t3+r10*4+416*16], m1
+ psubd m3, m17 ; ab3[ 0] 343
+ mova [t3+r10*4+416*28], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m2, [t3+r10*4+4]
+ paddd m3, m2, [t3+r10*4+0]
+ paddd m3, [t3+r10*4+8]
+ mova m1, [t3+r10*4+416*4+0]
+ paddd m2, m3
+ pslld m3, 2
+ paddd m1, [t3+r10*4+416*4+8]
+ paddd m3, m2
+ pandn m2, m24, m3
+ psrld m3, 12
+ paddd m0, m2, [t3+r10*4+416*12] ; a5
+ paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8)
+ mova [t3+r10*4+416*12], m2
+ mova [t3+r10*4+416*16], m3
+ paddd m2, m1, [t3+r10*4+416*4+4]
+ paddd m2, m2 ; ab3[ 1] 222
+ mova m3, [t3+r10*4+416*20]
+ paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343
+ mova [t3+r10*4+416*20], m2
+ paddd m2, m2
+ psubd m2, m1 ; ab3[ 1] 343
+ mova [t3+r10*4+416*24], m2
+ paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m1, m24, m17
+ psrld m17, 12
+ pandn m3, m24, m2
+ psrld m2, 12
+ paddd m1, m3 ; a3
+ pmovzxbd m3, [dstq+r10]
+ paddd m17, m2 ; b3 + (1 << 8)
+ pmaddwd m0, m3 ; a5 * src
+ pmaddwd m1, m3 ; a3 * src
+ vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
+ psubd m16, m0 ; b5 - a5 * src + (1 << 8)
+ psubd m17, m1 ; b3 - a3 * src + (1 << 8)
+ psrld m16, 9
+ pslld m17, 7
+ vmovdqu8 m17{k2}, m16
+ vpdpwssd m3, m17, m26
+ packuswb m3, m2
+ vpermb m16, m27, m3
+ mova [dstq+r10], xm16
+ add r10, 16
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m1, [t3+r10*4+416*8+0]
+ paddd m1, [t3+r10*4+416*8+8]
+ paddd m2, m1, [t3+r10*4+416*8+4]
+ paddd m2, m2 ; ab3[ 1] 222
+ mova m0, [t3+r10*4+416*20]
+ paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343
+ pmovzxbd m3, [dstq+r10]
+ mova [t3+r10*4+416*20], m2
+ paddd m2, m2
+ psubd m2, m1 ; ab3[ 1] 343
+ mova [t3+r10*4+416*28], m2
+ paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m1, m24, m17
+ psrld m17, 12
+ pandn m2, m24, m0
+ psrld m0, 12
+ paddd m1, m2 ; a3
+ paddd m17, m0 ; b3 + (1 << 8)
+ mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7)
+ pmaddwd m1, m3 ; a3 * src
+ pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src
+ vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
+ psubd m17, m1 ; b3 - a3 * src + (1 << 8)
+ psubd m16, m0 ; b5 - a5 * src + (1 << 7)
+ pslld m17, 7
+ vpalignr m17{k2}, m16, m16, 1
+ vpdpwssd m3, m17, m26
+ packuswb m3, m3
+ vpermb m16, m27, m3
+ mova [dstq+r10], xm16
+ add r10, 16
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration_sse.asm b/third_party/dav1d/src/x86/looprestoration_sse.asm
new file mode 100644
index 0000000000..01eb6fa348
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_sse.asm
@@ -0,0 +1,3681 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
+wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf3: db 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+sgr_lshuf5: db 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+pb_right_ext_mask: times 24 db 0xff
+ times 8 db 0
+pb_1: times 16 db 1
+pb_3: times 16 db 3
+pw_256: times 8 dw 256
+pw_2056: times 8 dw 2056
+pw_m16380: times 8 dw -16380
+pd_4096: times 4 dd 4096
+pd_34816: times 4 dd 34816
+pd_0xffff: times 4 dd 0xffff
+pd_0xf00800a4: times 4 dd 0xf00800a4
+pd_0xf00801c7: times 4 dd 0xf00801c7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+%macro movif64 2 ; dst, src
+ %if ARCH_X86_64
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movif32 2 ; dst, src
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
+ %assign pic_reg_stk_off 4
+ %xdefine PIC_reg %1
+ %if %2 == 1
+ mov [esp], %1
+ %endif
+ LEA PIC_reg, PIC_base_offset
+ %if %3 == 1
+ XCHG_PIC_REG
+ %endif
+ %endmacro
+
+ %macro XCHG_PIC_REG 0
+ mov [esp+pic_reg_stk_off], PIC_reg
+ %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
+ mov PIC_reg, [esp+pic_reg_stk_off]
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 0
+ %endmacro
+
+ %define PIC_sym(sym) (sym)
+%endif
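+
+; Note: x86-32 has no RIP-relative addressing, so constants are reached
+; through a PIC base register instead. SETUP_PIC loads PIC_reg with the
+; address of the section start ($$) and PIC_sym(sym) then addresses data
+; relative to that base; XCHG_PIC_REG stores the current base and reloads
+; a previously saved one, alternating between two 4-byte stack slots.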
+
+%macro WIENER 0
+%if ARCH_X86_64
+DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
+cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt, x
+ %define tmpstrideq strideq
+ %define base 0
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ movq m14, [fltq]
+ add lpfq, wq
+ movq m7, [fltq+16]
+ add dstq, wq
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m8, [wiener_shufA]
+ pshufd m12, m14, q2222 ; x0 x0
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+%else
+ mova m10, [pw_m16380]
+ punpcklwd m14, m14
+ pshufd m11, m14, q0000 ; x0
+ pshufd m12, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+DECLARE_REG_TMP 4, 0, _, 5
+%if cpuflag(ssse3)
+ %define m10 [base+wiener_shufC]
+ %define m11 [base+wiener_shufD]
+ %define stk_off 96
+%else
+ %define m10 [base+pw_m16380]
+ %define m11 [stk+96]
+ %define stk_off 112
+%endif
+cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
+ %define base r6-pb_right_ext_mask-21
+ %define stk esp
+ %define dstq leftq
+ %define edgeb byte edged
+ %define edged [stk+ 8]
+ %define dstmp [stk+12]
+ %define hd dword [stk+16]
+ %define wq [stk+20]
+ %define strideq [stk+24]
+ %define leftmp [stk+28]
+ %define t2 [stk+32]
+ %define t4 [stk+36]
+ %define t5 [stk+40]
+ %define t6 [stk+44]
+ %define m8 [base+wiener_shufA]
+ %define m9 [base+wiener_shufB]
+ %define m12 [stk+48]
+ %define m13 [stk+64]
+ %define m14 [stk+80]
+ %define m15 [base+pw_2056]
+ mov r1, r6m ; flt
+ mov r0, r0m ; dst
+ mov r4, r4m ; w
+ mov lpfq, lpfm
+ mov r2, r7m ; edge
+ mov r5, r5m ; h
+ movq m3, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r4
+ mov r1, r1m ; stride
+ add lpfq, r4
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r4*2+stk_off]
+ mov hd, r5
+ neg r4
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r4
+ mov strideq, r1
+ mov leftmp, r2
+ mov r4, r1
+%if cpuflag(ssse3)
+ pshufb m3, [base+wiener_init]
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q3333
+ punpcklqdq m3, m3
+%else
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m11, m0
+%endif
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+%endif
+ psllw m7, 5
+ pshufd m6, m7, q0000 ; y0 y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ add t3, tmpstrideq
+ mov [rsp], t3 ; below
+ mov t4, t1
+ add t1, 384*2
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ RET
+.no_top:
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ lea t3, [t3+tmpstrideq*2]
+ mov [rsp], t3
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+.v2:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ jmp .v1
+.extend_right:
+ movd m2, [lpfq-4]
+%if ARCH_X86_64
+ push r0
+ lea r0, [pb_right_ext_mask+21]
+ movu m0, [r0+xq+0]
+ movu m1, [r0+xq+8]
+ pop r0
+%else
+ movu m0, [r6+xq+0]
+ movu m1, [r6+xq+8]
+%endif
+%if cpuflag(ssse3)
+ pshufb m2, [base+pb_3]
+%else
+ punpcklbw m2, m2
+ pshuflw m2, m2, q3333
+ punpcklqdq m2, m2
+%endif
+ pand m4, m0
+ pand m5, m1
+ pandn m0, m2
+ pandn m1, m2
+ por m4, m0
+ por m5, m1
+ ret
+.h:
+ %define stk esp+4 ; offset due to call
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+%macro %%h7 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m8
+ pmaddubsw m0, m12
+ pshufb m1, m5, m8
+ pmaddubsw m1, m12
+ pshufb m2, m4, m9
+ pmaddubsw m2, m13
+ pshufb m3, m5, m9
+ pmaddubsw m3, m13
+ paddw m0, m2
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ paddw m1, m3
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m0, m2
+ mova m2, [base+pw_m16380]
+ paddw m1, m3
+ paddw m4, m2
+ paddw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 1
+ pslldq m1, m4, 1
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ psrldq m1, m4, 2
+ pslldq m2, m4, 2
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m12
+ paddw m0, m1
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m10
+ paddsw m0, m2
+ psrldq m1, m5, 1
+ pslldq m2, m5, 1
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m11
+ psrldq m2, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m2, m3
+ punpckhbw m4, m3
+ paddw m2, m4
+ pmullw m2, m12
+ paddw m1, m2
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m10
+ paddsw m1, m4
+%endif
+%endmacro
+ %%h7
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ %%h7
+%if ARCH_X86_64
+ mova m2, [t4+xq*2]
+ paddw m2, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m2, [r2+xq*2]
+ mov r2, t2
+ paddw m2, [r2+xq*2]
+ mov r2, t5
+%endif
+ mova m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m5, [t5+xq*2]
+%else
+ mova m5, [r2+xq*2]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ paddw m4, m0, [t6+xq*2]
+%else
+ paddw m4, m0, [r2+xq*2]
+ mov r2, t4
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m0, m3
+ mova m3, [t3+xq*2+16]
+ paddd m4, m2
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+ mova m5, [t5+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t5
+ mova m5, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2+16]
+ packuswb m0, m4
+%if ARCH_X86_64
+ paddw m4, m1, [t6+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, r1
+%endif
+ ret
+%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
+.v:
+ mov xq, wq
+.v_loop:
+%if ARCH_X86_64
+ mova m1, [t4+xq*2]
+ paddw m1, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m1, [r2+xq*2]
+ mov r2, t2
+ paddw m1, [r2+xq*2]
+ mov r2, t6
+%endif
+ mova m2, [t3+xq*2]
+ mova m4, [t1+xq*2]
+%if ARCH_X86_64
+ paddw m3, m4, [t6+xq*2]
+ paddw m4, [t5+xq*2]
+%else
+ paddw m3, m4, [r2+xq*2]
+ mov r2, t5
+ paddw m4, [r2+xq*2]
+ mov r2, t4
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m6
+ punpckhwd m3, m4
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ mova m3, [t3+xq*2+16]
+ mova m5, [t1+xq*2+16]
+%if ARCH_X86_64
+ paddw m4, m5, [t6+xq*2+16]
+ paddw m5, [t5+xq*2+16]
+%else
+ paddw m4, m5, [r2+xq*2+16]
+ mov r2, t5
+ paddw m5, [r2+xq*2+16]
+ movifnidn dstq, dstmp
+%endif
+ packuswb m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ add dstq, strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+%endif
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ ret
+%endif
+
+%if ARCH_X86_64
+cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt, x
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ movq m14, [fltq]
+ add lpfq, wq
+ movq m7, [fltq+16]
+ add dstq, wq
+ mova m8, [pw_m16380]
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+ mova m12, [wiener_l_shuf]
+%else
+ punpcklwd m14, m14
+ pshufd m11, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+%if cpuflag(ssse3)
+ %define stk_off 80
+%else
+ %define m11 [stk+80]
+ %define stk_off 96
+%endif
+cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
+ %define stk esp
+ %define leftmp [stk+28]
+ %define m8 [base+pw_m16380]
+ %define m12 [base+wiener_l_shuf]
+ %define m14 [stk+48]
+ mov r1, r6m ; flt
+ mov r0, r0m ; dst
+ mov r4, r4m ; w
+ mov lpfq, lpfm
+ mov r2, r7m ; edge
+ mov r5, r5m ; h
+ movq m2, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r4
+ mov r1, r1m ; stride
+ add lpfq, r4
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r4*2+stk_off]
+ mov hd, r5
+ neg r4
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r4
+ mov strideq, r1
+ mov leftmp, r2
+ mov r4, r1
+%if cpuflag(ssse3)
+ pshufb m2, [base+wiener_init]
+ pshufd m1, m2, q3333
+ punpcklqdq m2, m2
+%else
+ punpcklwd m2, m2
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m11, m0
+%endif
+ mova m13, m1
+ mova m14, m2
+%endif
+ psllw m7, 5
+ pshufd m6, m7, q0000 ; __ y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea xq, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ mov t3, t1
+ add t1, 384*2
+ add xq, tmpstrideq
+ mov [rsp], xq ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ lea t3, [t3+tmpstrideq*2]
+ mov [rsp], t3
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ add dstq, strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ movifnidn dstmp, dstq
+.v1:
+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ jmp .end
+.h:
+ %define stk esp+4
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -17
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+.h_have_right:
+%macro %%h5 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m9
+ pmaddubsw m0, m13
+ pshufb m1, m5, m9
+ pmaddubsw m1, m13
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m8
+ paddw m5, m8
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 2
+ pslldq m1, m4, 2
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m8
+ paddsw m0, m2
+ psrldq m1, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m11
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m8
+ paddsw m1, m4
+%endif
+%endmacro
+ %%h5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -17
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+.hv_have_right:
+ %%h5
+ mova m2, [t3+xq*2]
+ paddw m2, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ mova m3, [t2+xq*2]
+ paddw m4, m0, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m3, [r2+xq*2]
+ mov r2, t4
+ paddw m4, m0, [r2+xq*2]
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+xq*2+16]
+ paddw m2, [t1+xq*2+16]
+ packuswb m0, m4
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, m1, [t4+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ movifnidn dstmp, dstq
+ ret
+%if cpuflag(ssse3)
+.v:
+ mov xq, wq
+.v_loop:
+ mova m3, [t1+xq*2]
+ paddw m1, m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m2, [t2+xq*2]
+ paddw m3, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m2, [r2+xq*2]
+ mov r2, t4
+ paddw m3, [r2+xq*2]
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3
+ pmaddwd m2, m6
+ punpckhwd m3, m3
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+ mova m4, [t1+xq*2+16]
+ paddw m2, m4, [t3+xq*2+16]
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, [t4+xq*2+16]
+%else
+ paddw m4, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ packuswb m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ ret
+%endif
+%endmacro
+
+INIT_XMM sse2
+WIENER
+
+INIT_XMM ssse3
+WIENER
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; self-guided ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
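+; Rough outline of the self-guided passes below: per pixel, a box sum and
+; sum of squares are accumulated (n = 25 for the 5x5 filter, n = 9 for 3x3),
+; p = sumsq * n - sum * sum is scaled by the strength s and clipped, the
+; result indexes the sgr_x_by_x table, and the selected x is folded back
+; into b using the 164 (n = 25) / 455 (n = 9) reciprocal constants before
+; the neighbour-weighted output rows (.n0/.n1).
+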
+%macro GATHERDD 3 ; dst, src, tmp
+ movd %3d, %2
+ %if ARCH_X86_64
+ movd %1, [r13+%3]
+ pextrw %3d, %2, 2
+ pinsrw %1, [r13+%3+2], 3
+ pextrw %3d, %2, 4
+ pinsrw %1, [r13+%3+2], 5
+ pextrw %3d, %2, 6
+ pinsrw %1, [r13+%3+2], 7
+ %else
+ movd %1, [base+sgr_x_by_x-0xf03+%3]
+ pextrw %3, %2, 2
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
+ pextrw %3, %2, 4
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
+ pextrw %3, %2, 6
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
+ %endif
+%endmacro
+
+%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
+ %if ARCH_X86_64
+ %define tmp r14
+ %else
+ %define tmp %4
+ %endif
+ GATHERDD %1, %2, tmp
+ GATHERDD %2, %3, tmp
+ movif32 %4, %5
+ psrld %1, 24
+ psrld %2, 24
+ packssdw %1, %2
+%endmacro
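+; Note: there is no SIMD gather instruction before AVX2, so GATHER_X_BY_X
+; emulates one: each packed index is moved to a GPR with movd/pextrw and
+; used to load from the sgr_x_by_x table (addressed via r13 on x86-64, or
+; the PIC base on x86-32), the loads are reassembled with pinsrw, and the
+; psrld by 24 afterwards isolates the looked-up byte in each lane.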
+
+%macro MULLD 3 ; dst, src, tmp
+ pmulhuw %3, %1, %2
+ pmullw %1, %2
+ pslld %3, 16
+ paddd %1, %3
+%endmacro
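+; Note: pmulld is SSE4.1, so MULLD emulates the 32-bit multiply: with the
+; 16-bit factor replicated in both halves of each dword of %2, the product
+; is dst_lo*src + (dst_hi*src << 16) (mod 2^32); pmullw supplies the low
+; halves and pmulhuw the carry from dst_lo*src into the upper half.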
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 0, 1, 2, 3, 5
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 5*16
+ %else
+ %assign extra_stack 3*16
+ %endif
+cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*0+4*6]
+ %define stridemp dword [esp+calloff+16*0+4*7]
+ %define leftm dword [esp+calloff+16*3+4*0]
+ %define lpfm dword [esp+calloff+16*3+4*1]
+ %define w0m dword [esp+calloff+16*3+4*2]
+ %define hd dword [esp+calloff+16*3+4*3]
+ %define edgeb byte [esp+calloff+16*3+4*4]
+ %define edged dword [esp+calloff+16*3+4*4]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t0m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define m8 [base+pb_1]
+ %define m9 [esp+calloff+16*2]
+ %define m10 [base+pd_0xf00800a4]
+ %define m11 [base+sgr_lshuf5]
+ %define m12 [base+pd_34816]
+ %define m13 [base+pb_0to15]
+ %define r10 r4
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+DECLARE_REG_TMP 8, 7, 9, 11, 12
+cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ mov edged, r7m
+ movu m9, [paramsq]
+ add lpfq, wq
+ mova m8, [pb_1]
+ lea t1, [rsp+wq*2+20]
+ mova m10, [pd_0xf00800a4]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*12+16]
+ mova m12, [pd_34816] ; (1 << 11) + (1 << 15)
+ lea t4, [rsp+wq*2+400*20+16]
+ pshufhw m7, m9, q0000
+ pshufb m9, [pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ neg wq
+ mova m13, [pb_0to15]
+ pxor m6, m6
+ mova m11, [sgr_lshuf5]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ movu m1, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*20+16]
+ mov t3m, t3
+ pshufhw m7, m1, q0000
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 2
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ movif32 t2m, t1
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t0m, t2
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, stridemp
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ call .h
+ add lpfq, stridemp
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+ sub hd, 2
+ movif32 t0, t0m
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .h_top
+ add lpfq, stridemp
+ call .hv_bottom
+.end:
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ movif32 dstq, dstm
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ lea t2, [t1+400*6]
+ movif32 t2m, t2
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ movif32 t0m, t0
+ jmp .main
+.no_top_height1:
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movd m1, wd
+ movd m3, [lpfq-1]
+ pshufb m1, m6
+ pshufb m3, m6
+ psubb m2, m8, m1
+ pcmpgtb m2, m13
+ pand m5, m2
+ pandn m2, m3
+ por m5, m2
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m11
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq-1]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -10
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+wq*2+400*0]
+ paddd m1, [t1+wq*2+400*2]
+ paddd m2, [t1+wq*2+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+wq*2+400*0], m0
+ mova [t1+wq*2+400*2], m1
+ mova [t1+wq*2+400*4], m2
+ add wq, 8
+ jl .h_loop
+ ret
+.top_fixup:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+wq*2+400*0]
+ mova m1, [t1+wq*2+400*2]
+ mova m2, [t1+wq*2+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m1
+ mova [t2+wq*2+400*4], m2
+ add wq, 8
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv_main
+.hv_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m11
+ jmp .hv_main
+.hv_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv_loop_start
+%endif
+.hv_loop:
+ movif32 lpfq, hvsrcm
+.hv_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -10
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t3, hd
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+wq*2+400*0]
+ paddd m4, m2, [t1+wq*2+400*2]
+ paddd m5, m3, [t1+wq*2+400*4]
+%if ARCH_X86_64
+ test hd, hd
+%else
+ test t3, t3
+%endif
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+wq*2+400*0] ; hv sum
+ paddd m4, [t2+wq*2+400*2] ; hv sumsq
+ paddd m5, [t2+wq*2+400*4]
+ mova [t0+wq*2+400*0], m0
+ pslld m0, m4, 4
+ mova [t0+wq*2+400*2], m2
+ mova [t0+wq*2+400*4], m3
+ pslld m2, m4, 3
+ paddd m4, m0
+ pslld m0, m5, 4
+ paddd m4, m2 ; a * 25
+ pslld m2, m5, 3
+ paddd m5, m0
+ paddd m5, m2
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m2 ; p * s
+ MULLD m5, m9, m2
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m12
+ mova [t4+wq*2+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ movif32 t2m, t2
+ movif32 t0m, t0
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+wq*2+400*0], m1
+ paddw m1, m0
+ mova [t1+wq*2+400*2], m4
+ paddd m4, m2
+ mova [t1+wq*2+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m2, [t1+wq*2+400*2]
+ mova m3, [t1+wq*2+400*4]
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m4, m2, [t2+wq*2+400*2]
+ paddd m5, m3, [t2+wq*2+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ pslld m0, m4, 4
+ paddd m5, m3
+ pslld m2, m4, 3
+ paddd m4, m0
+ pslld m0, m5, 4
+ paddd m4, m2 ; a * 25
+ pslld m2, m5, 3
+ paddd m5, m0
+ paddd m5, m2
+ punpcklwd m0, m1, m6
+ punpckhwd m1, m6
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m2 ; p * s
+ MULLD m5, m9, m2
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m12
+ mova [t4+wq*2+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+ 2]
+ movu m3, [t4+wq*2+ 4]
+ movu m1, [t3+wq*4+ 4]
+ movu m4, [t3+wq*4+ 8]
+ movu m2, [t3+wq*4+20]
+ movu m5, [t3+wq*4+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*2+ 0]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+wq*2+400*2+ 0], m0
+ mova [t3+wq*4+400*4+ 0], m1
+ mova [t3+wq*4+400*4+16], m2
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*2+ 2]
+ movu m3, [t4+wq*2+ 4]
+ movu m1, [t3+wq*4+ 4]
+ movu m4, [t3+wq*4+ 8]
+ movu m2, [t3+wq*4+20]
+ movu m5, [t3+wq*4+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*2+ 0]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+wq*2+400*2+ 0]
+ paddd m4, m1, [t3+wq*4+400*4+ 0]
+ paddd m5, m2, [t3+wq*4+400*4+16]
+ mova [t4+wq*2+400*2+ 0], m0
+ mova [t3+wq*4+400*4+ 0], m1
+ mova [t3+wq*4+400*4+16], m2
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movq m0, [dstq+wq]
+ mova m3, [t4+wq*2+400*2+ 0]
+ mova m4, [t3+wq*4+400*4+ 0]
+ mova m5, [t3+wq*4+400*4+16]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 7)
+ psubd m5, m3
+ psrad m4, 8
+ psrad m5, 8
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 4*16
+ %else
+ %assign extra_stack 2*16
+ %endif
+cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*2+4*0]
+ %define stridemp dword [esp+calloff+16*2+4*1]
+ %define leftm dword [esp+calloff+16*2+4*2]
+ %define lpfm dword [esp+calloff+16*2+4*3]
+ %define w0m dword [esp+calloff+16*2+4*4]
+ %define hd dword [esp+calloff+16*2+4*5]
+ %define edgeb byte [esp+calloff+16*2+4*6]
+ %define edged dword [esp+calloff+16*2+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %define m8 [base+pb_0to15]
+ %define m9 [esp+calloff+16*1]
+ %define m10 [base+pd_0xf00801c7]
+ %define m11 [base+pd_34816]
+ %define m12 m6
+ %define m13 [base+sgr_lshuf3]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ mov hd, hm
+ mov edged, r7m
+ movq m9, [paramsq+4]
+ add lpfq, wq
+ lea t1, [rsp+wq*2+12]
+ mova m8, [pb_0to15]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*12+8]
+ mova m10, [pd_0xf00801c7]
+ lea t4, [rsp+wq*2+400*32+8]
+ mova m11, [pd_34816]
+ pshuflw m7, m9, q3333
+ pshufb m9, [pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ neg wq
+ pxor m6, m6
+ mova m13, [sgr_lshuf3]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ movq m1, [r1+4]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*32+16]
+ mov t3m, t3
+ pshuflw m7, m1, q3333
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 2
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m1, [t1+wq*2+400*2]
+ mova m2, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m1
+ mova [t2+wq*2+400*4], m2
+ add wq, 8
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movd m0, [lpfq-1]
+ movd m1, wd
+ mova m3, m8
+ pshufb m0, m6
+ pshufb m1, m6
+ mova m2, m6
+ psubb m2, m1
+ pcmpgtb m2, m3
+ pand m5, m2
+ pandn m2, m0
+ por m5, m2
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -9
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+wq*2+400*0], m1
+ mova [t1+wq*2+400*2], m2
+ mova [t1+wq*2+400*4], m3
+ add wq, 8
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m5, [lpfq+wq]
+.hv0_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -9
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+wq*2+400*0]
+ paddd m4, m2, [t1+wq*2+400*2]
+ paddd m5, m3, [t1+wq*2+400*4]
+ mova [t1+wq*2+400*0], m1
+ mova [t1+wq*2+400*2], m2
+ mova [t1+wq*2+400*4], m3
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m5, [lpfq+wq]
+.hv1_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -9
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m4, m2, [t2+wq*2+400*2]
+ paddd m5, m3, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m2
+ mova [t2+wq*2+400*4], m3
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+400*2 +4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+400*0+ 4]
+ movu m1, [t3+wq*4+400*0+ 8]
+ movu m2, [t3+wq*4+400*0+24]
+ movu m3, [t4+wq*2+400*0+ 2]
+ movu m4, [t3+wq*4+400*0+ 4]
+ movu m5, [t3+wq*4+400*0+20]
+ paddw m0, [t4+wq*2+400*0+ 0]
+ paddd m1, [t3+wq*4+400*0+ 0]
+ paddd m2, [t3+wq*4+400*0+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[-1] 444
+ pslld m4, 2 ; b[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a[-1] 343
+ psubd m4, m1 ; b[-1] 343
+ psubd m5, m2
+ mova [t4+wq*2+400*4], m3
+ mova [t3+wq*4+400*8+ 0], m4
+ mova [t3+wq*4+400*8+16], m5
+ movu m0, [t4+wq*2+400*2+ 4]
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m2, [t3+wq*4+400*4+24]
+ movu m3, [t4+wq*2+400*2+ 2]
+ movu m4, [t3+wq*4+400*4+ 4]
+ movu m5, [t3+wq*4+400*4+20]
+ paddw m0, [t4+wq*2+400*2+ 0]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m2, [t3+wq*4+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[ 0] 444
+ pslld m4, 2 ; b[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*2+400* 6], m3
+ mova [t3+wq*4+400*12+ 0], m4
+ mova [t3+wq*4+400*12+16], m5
+ psubw m3, m0 ; a[ 0] 343
+ psubd m4, m1 ; b[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*2+400* 8], m3
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m5
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m3, [t4+wq*2+400*0+4]
+ movu m1, [t4+wq*2+400*0+2]
+ paddw m3, [t4+wq*2+400*0+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*2+400*4]
+ paddw m3, [t4+wq*2+400*6]
+ mova [t4+wq*2+400*4], m2
+ mova [t4+wq*2+400*6], m1
+ movu m4, [t3+wq*4+400*0+8]
+ movu m1, [t3+wq*4+400*0+4]
+ paddd m4, [t3+wq*4+400*0+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*4+400* 8+ 0]
+ paddd m4, [t3+wq*4+400*12+ 0]
+ mova [t3+wq*4+400* 8+ 0], m2
+ mova [t3+wq*4+400*12+ 0], m1
+ movu m5, [t3+wq*4+400*0+24]
+ movu m1, [t3+wq*4+400*0+20]
+ paddd m5, [t3+wq*4+400*0+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*4+400* 8+16]
+ paddd m5, [t3+wq*4+400*12+16]
+ mova [t3+wq*4+400* 8+16], m2
+ mova [t3+wq*4+400*12+16], m1
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*2+400*2+4]
+ movu m1, [t4+wq*2+400*2+2]
+ paddw m3, [t4+wq*2+400*2+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*2+400*6]
+ paddw m3, [t4+wq*2+400*8]
+ mova [t4+wq*2+400*6], m1
+ mova [t4+wq*2+400*8], m2
+ movu m4, [t3+wq*4+400*4+8]
+ movu m1, [t3+wq*4+400*4+4]
+ paddd m4, [t3+wq*4+400*4+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*4+400*12+ 0]
+ paddd m4, [t3+wq*4+400*16+ 0]
+ mova [t3+wq*4+400*12+ 0], m1
+ mova [t3+wq*4+400*16+ 0], m2
+ movu m5, [t3+wq*4+400*4+24]
+ movu m1, [t3+wq*4+400*4+20]
+ paddd m5, [t3+wq*4+400*4+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*4+400*12+16]
+ paddd m5, [t3+wq*4+400*16+16]
+ mova [t3+wq*4+400*12+16], m1
+ mova [t3+wq*4+400*16+16], m2
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 10*16
+ %else
+ %assign extra_stack 8*16
+ %endif
+cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*8+4*0]
+ %define stridemp dword [esp+calloff+16*8+4*1]
+ %define leftm dword [esp+calloff+16*8+4*2]
+ %define lpfm dword [esp+calloff+16*8+4*3]
+ %define w0m dword [esp+calloff+16*8+4*4]
+ %define hd dword [esp+calloff+16*8+4*5]
+ %define edgeb byte [esp+calloff+16*8+4*6]
+ %define edged dword [esp+calloff+16*8+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %xdefine m8 m6
+ %define m9 [base+pd_0xffff]
+ %define m10 [base+pd_34816]
+ %define m11 [base+pd_0xf00801c7]
+ %define m12 [base+pd_0xf00800a4]
+ %define m13 [esp+calloff+16*4]
+ %define m14 [esp+calloff+16*5]
+ %define m15 [esp+calloff+16*6]
+ %define m6 [esp+calloff+16*7]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ mov edged, r7m
+ mova m15, [paramsq]
+ add lpfq, wq
+ mova m9, [pd_0xffff]
+ lea t1, [rsp+wq*2+44]
+ mova m10, [pd_34816]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*24+40]
+ mova m11, [pd_0xf00801c7]
+ lea t4, [rsp+wq*2+400*52+40]
+ mova m12, [base+pd_0xf00800a4]
+ neg wq
+ pshuflw m13, m15, q0000
+ pshuflw m14, m15, q2222
+ pshufhw m15, m15, q1010
+ punpcklqdq m13, m13 ; s0
+ punpcklqdq m14, m14 ; s1
+ punpckhqdq m15, m15 ; w0 w1
+ pxor m6, m6
+ psllw m15, 2
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ mova m2, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+52]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*24+48]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*52+48]
+ mov t3m, t3
+ mov t4m, t4
+ neg wq
+ pshuflw m0, m2, q0000
+ pshuflw m1, m2, q2222
+ pshufhw m2, m2, q1010
+ punpcklqdq m0, m0 ; s0
+ punpcklqdq m1, m1 ; s1
+ punpckhqdq m2, m2 ; w0 w1
+ mov w1m, wd
+ pxor m3, m3
+ psllw m2, 2
+ mova m13, m0
+ mova m14, m1
+ sub wd, 2
+ mova m15, m2
+ mova m6, m3
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+%if ARCH_X86_64
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
+%else
+ mov wq, w0m
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
+%endif
+ add t1, 400*12
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hd, hd
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+wq*2+400* 0]
+ mova m1, [t1+wq*2+400* 2]
+ mova m2, [t1+wq*2+400* 4]
+ paddw m0, m0
+ mova m3, [t1+wq*2+400* 6]
+ paddd m1, m1
+ mova m4, [t1+wq*2+400* 8]
+ paddd m2, m2
+ mova m5, [t1+wq*2+400*10]
+ mova [t2+wq*2+400* 0], m0
+ mova [t2+wq*2+400* 2], m1
+ mova [t2+wq*2+400* 4], m2
+ mova [t2+wq*2+400* 6], m3
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+ add wq, 8
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+%if ARCH_X86_64
+ SWAP m8, m6
+%endif
+ movd m1, wd
+ movd m3, [lpfq-1]
+ pshufb m1, m8
+ pshufb m3, m8
+ psubb m2, [base+pb_1], m1
+ pcmpgtb m2, [base+pb_0to15]
+ pand m5, m2
+ pandn m2, m3
+ por m5, m2
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq-1]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .h_have_right
+ cmp wd, -10
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m7, m0, m8
+ pmaddwd m7, m7
+ punpckhwd m0, m8
+ pmaddwd m0, m0
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ paddd m2, m7 ; sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ mova [t1+wq*2+400* 6], m1
+ mova [t1+wq*2+400* 8], m2
+ mova [t1+wq*2+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m7, m2 ; sumsq5
+ paddd m5, m3
+ mova [t1+wq*2+400* 0], m8
+ mova [t1+wq*2+400* 2], m7
+ mova [t1+wq*2+400* 4], m5
+ add wq, 8
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv0_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .hv0_have_right
+ cmp wd, -10
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ movif32 t3, t3m
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m7, m0, m8
+ pmaddwd m7, m7
+ punpckhwd m0, m8
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ pmaddwd m0, m0
+ paddd m2, m7 ; h sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m7, m2 ; h sumsq5
+ paddd m5, m3
+ mova [t3+wq*4+400*8+ 8], m8
+ mova [t3+wq*4+400*0+ 8], m7
+ mova [t3+wq*4+400*0+24], m5
+ paddw m8, [t1+wq*2+400* 0]
+ paddd m7, [t1+wq*2+400* 2]
+ paddd m5, [t1+wq*2+400* 4]
+ mova [t1+wq*2+400* 0], m8
+ mova [t1+wq*2+400* 2], m7
+ mova [t1+wq*2+400* 4], m5
+ paddw m0, m1, [t1+wq*2+400* 6]
+ paddd m4, m2, [t1+wq*2+400* 8]
+ paddd m5, m3, [t1+wq*2+400*10]
+ mova [t1+wq*2+400* 6], m1
+ mova [t1+wq*2+400* 8], m2
+ mova [t1+wq*2+400*10], m3
+ paddw m1, m0, [t2+wq*2+400* 6]
+ paddd m2, m4, [t2+wq*2+400* 8]
+ paddd m3, m5, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m0
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv1_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .hv1_have_right
+ cmp wd, -10
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m7, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m7, m3
+ punpcklwd m0, m7, m3
+ pmaddwd m0, m0
+ punpckhwd m7, m3
+ pmaddwd m7, m7
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m8
+ pmaddwd m1, m1
+ punpckhwd m3, m8
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ palignr m5, m4, 8
+ punpckhwd m1, m4, m5
+ paddw m8, m4, m5
+ pmaddwd m1, m1
+ punpcklwd m4, m5
+ pmaddwd m4, m4
+ paddd m7, m3
+ paddw m5, m2, [t2+wq*2+400* 6]
+ mova [t2+wq*2+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+wq*2+400* 8]
+ paddd m3, m7, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 8], m0
+ mova [t2+wq*2+400*10], m7
+ paddd m4, m0 ; h sumsq5
+ paddd m1, m7
+ pslld m0, m2, 3
+ pslld m7, m3, 3
+ paddd m2, m0 ; a3 * 9
+ paddd m3, m7
+%if ARCH_X86_32
+ mova [esp+20], m8
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ punpcklwd m0, m5, m8 ; b3
+ pmaddwd m7, m0, m0
+ punpckhwd m5, m8
+ pmaddwd m8, m5, m5
+ psubd m2, m7 ; p3
+ psubd m3, m8
+ MULLD m2, m14, m8 ; p3 * s1
+ MULLD m3, m14, m8
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m5, m11
+ paddusw m2, m11
+ paddusw m3, m11
+ psrld m2, 20 ; min(z3, 255)
+ movif32 t3, t3m
+ psrld m3, 20
+ GATHER_X_BY_X m8, m2, m3, r0, dstm
+ punpcklwd m2, m8, m8
+ punpckhwd m3, m8, m8
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ psrld m0, 12
+ psrld m5, 12
+ mova [t4+wq*2+400*4+ 4], m8
+ mova [t3+wq*4+400*8+ 8], m0
+ mova [t3+wq*4+400*8+24], m5
+%if ARCH_X86_32
+ mova m8, [esp+20]
+%else
+ SWAP m6, m8
+ pxor m6, m6
+%endif
+ paddw m5, m8, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m1, [t2+wq*2+400*4]
+ paddw m5, [t1+wq*2+400*0]
+ paddd m2, [t1+wq*2+400*2]
+ paddd m3, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m8
+ pslld m0, m2, 4
+ mova [t2+wq*2+400*2], m4
+ pslld m8, m3, 4
+ mova [t2+wq*2+400*4], m1
+ pslld m4, m2, 3
+ paddd m2, m0
+ pslld m7, m3, 3
+ paddd m3, m8
+ paddd m2, m4 ; a5 * 25
+ paddd m3, m7
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ punpcklwd m0, m5, m7 ; b5
+ pmaddwd m4, m0, m0
+ punpckhwd m5, m7
+ pmaddwd m1, m5, m5
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ psubd m2, m4 ; p5
+ psubd m3, m1
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m5, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m1, m2, m3, r0, dstm
+ punpcklwd m2, m1, m1
+ punpckhwd m3, m1, m1
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ mova [t4+wq*2+4], m1
+ psrld m0, 12
+ psrld m5, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m5
+ add wq, 8
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq*2+400* 6]
+ mova m4, [t1+wq*2+400* 8]
+ mova m5, [t1+wq*2+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq*2+400* 6]
+ paddd m2, m4, [t2+wq*2+400* 8]
+ paddd m3, m5, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m0
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ psubd m5, m3
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ mova [t3+wq*4+400*8+ 8], m3
+ mova [t3+wq*4+400*0+ 8], m4
+ mova [t3+wq*4+400*0+24], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+wq*2+400*0], m3
+ mova [t1+wq*2+400*2], m4
+ mova [t1+wq*2+400*4], m5
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m4, [t1+wq*2+400* 6]
+ mova m5, [t1+wq*2+400* 8]
+ mova m7, [t1+wq*2+400*10]
+ paddw m1, m4, [t2+wq*2+400* 6]
+ paddd m2, m5, [t2+wq*2+400* 8]
+ paddd m3, m7, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m4
+ mova [t2+wq*2+400* 8], m5
+ mova [t2+wq*2+400*10], m7
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ psubd m5, m3
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*4+4], m3
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova m4, [t3+wq*4+400*8+ 8]
+ mova m5, [t3+wq*4+400*0+ 8]
+ mova m7, [t3+wq*4+400*0+24]
+ paddw m1, m4, [t2+wq*2+400*0]
+ paddd m2, m5, [t2+wq*2+400*2]
+ paddd m3, m7, [t2+wq*2+400*4]
+ paddw m1, [t1+wq*2+400*0]
+ paddd m2, [t1+wq*2+400*2]
+ paddd m3, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m4
+ mova [t2+wq*2+400*2], m5
+ mova [t2+wq*2+400*4], m7
+ pslld m4, m2, 4
+ mova [t3+wq*4+400*8+ 8], m0
+ pslld m5, m3, 4
+ mova [t3+wq*4+400*8+24], m8
+ pslld m7, m2, 3
+ paddd m2, m4
+ pslld m8, m3, 3
+ paddd m3, m5
+ paddd m2, m7 ; a5 * 25
+ paddd m3, m8
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ punpcklwd m0, m1, m7 ; b5
+ pmaddwd m4, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ psubd m3, m5
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m4, m2, m3, r0, dstm
+ punpcklwd m2, m4, m4
+ punpckhwd m3, m4, m4
+ MULLD m0, m2, m7
+ MULLD m1, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+4], m4
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+400*0+ 2]
+ movu m1, [t3+wq*4+400*0+ 4]
+ movu m2, [t3+wq*4+400*0+20]
+ movu m7, [t4+wq*2+400*0+ 4]
+ movu m8, [t3+wq*4+400*0+ 8]
+ paddw m3, m0, [t4+wq*2+400*0+ 0]
+ paddd m4, m1, [t3+wq*4+400*0+ 0]
+ paddd m5, m2, [t3+wq*4+400*0+16]
+ paddw m3, m7
+ paddd m4, m8
+ movu m7, [t3+wq*4+400*0+24]
+ paddw m0, m3
+ paddd m1, m4
+ psllw m3, 2
+ pslld m4, 2
+ paddd m5, m7
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a5 565
+ paddd m1, m4 ; b5 565
+ paddd m2, m5
+ mova [t4+wq*2+400* 6+ 0], m0
+ mova [t3+wq*4+400*12+ 0], m1
+ mova [t3+wq*4+400*12+16], m2
+ movu m0, [t4+wq*2+400*2+ 4]
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m2, [t3+wq*4+400*4+24]
+ movu m3, [t4+wq*2+400*2+ 2]
+ movu m4, [t3+wq*4+400*4+ 4]
+ movu m5, [t3+wq*4+400*4+20]
+ paddw m0, [t4+wq*2+400*2+ 0]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m2, [t3+wq*4+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[-1] 444
+ pslld m4, 2 ; b3[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a3[-1] 343
+ psubd m4, m1 ; b3[-1] 343
+ psubd m5, m2
+ mova [t4+wq*2+400* 8+ 0], m3
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m5
+ movu m0, [t4+wq*2+400*4+ 4]
+ movu m1, [t3+wq*4+400*8+ 8]
+ movu m2, [t3+wq*4+400*8+24]
+ movu m3, [t4+wq*2+400*4+ 2]
+ movu m4, [t3+wq*4+400*8+ 4]
+ movu m5, [t3+wq*4+400*8+20]
+ paddw m0, [t4+wq*2+400*4+ 0]
+ paddd m1, [t3+wq*4+400*8+ 0]
+ paddd m2, [t3+wq*4+400*8+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[ 0] 444
+ pslld m4, 2 ; b3[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*2+400*10+ 0], m3
+ mova [t3+wq*4+400*20+ 0], m4
+ mova [t3+wq*4+400*20+16], m5
+ psubw m3, m0 ; a3[ 0] 343
+ psubd m4, m1 ; b3[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*2+400*12+ 0], m3
+ mova [t3+wq*4+400*24+ 0], m4
+ mova [t3+wq*4+400*24+16], m5
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*2+ 4]
+ movu m2, [t4+wq*2+ 2]
+ paddw m0, [t4+wq*2+ 0]
+ paddw m0, m2
+ paddw m2, m0
+ psllw m0, 2
+ paddw m0, m2 ; a5
+ movu m4, [t3+wq*4+ 8]
+ movu m5, [t3+wq*4+24]
+ movu m1, [t3+wq*4+ 4]
+ movu m3, [t3+wq*4+20]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddd m4, m1
+ paddd m5, m3
+ paddd m1, m4
+ paddd m3, m5
+ pslld m4, 2
+ pslld m5, 2
+ paddd m4, m1 ; b5
+ paddd m5, m3
+ movu m2, [t4+wq*2+400* 6]
+ paddw m2, m0
+ mova [t4+wq*2+400* 6], m0
+ paddd m0, m4, [t3+wq*4+400*12+ 0]
+ paddd m1, m5, [t3+wq*4+400*12+16]
+ mova [t3+wq*4+400*12+ 0], m4
+ mova [t3+wq*4+400*12+16], m5
+ mova [rsp+16+ARCH_X86_32*4], m1
+ movu m3, [t4+wq*2+400*2+4]
+ movu m5, [t4+wq*2+400*2+2]
+ paddw m3, [t4+wq*2+400*2+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ movu m3, [t4+wq*2+400* 8]
+ paddw m3, [t4+wq*2+400*10]
+ paddw m3, m4
+ mova [t4+wq*2+400* 8], m4
+ mova [t4+wq*2+400*10], m5
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m5, [t3+wq*4+400*4+ 4]
+ movu m7, [t3+wq*4+400*4+24]
+ movu m8, [t3+wq*4+400*4+20]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m7, [t3+wq*4+400*4+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+%if ARCH_X86_32
+ mova [esp+52], m8
+ psubd m8, m7
+%else
+ psubd m6, m8, m7
+ SWAP m8, m6
+%endif
+ paddd m1, m4, [t3+wq*4+400*16+ 0]
+ paddd m7, m8, [t3+wq*4+400*16+16]
+ paddd m1, [t3+wq*4+400*20+ 0]
+ paddd m7, [t3+wq*4+400*20+16]
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m8
+ mova [t3+wq*4+400*20+ 0], m5
+%if ARCH_X86_32
+ mova m8, [esp+52]
+%else
+ SWAP m8, m6
+ pxor m6, m6
+%endif
+ mova [t3+wq*4+400*20+16], m8
+ mova [rsp+32+ARCH_X86_32*4], m7
+ movq m4, [dstq+wq]
+ punpcklbw m4, m6
+ punpcklwd m5, m4, m6
+ punpcklwd m7, m2, m6
+ pmaddwd m7, m5 ; a5 * src
+ punpcklwd m8, m3, m6
+ pmaddwd m8, m5 ; a3 * src
+ punpckhwd m5, m4, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13)
+ psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13)
+ psrld m0, 9
+ pslld m1, 7
+ pand m0, m9
+ pandn m8, m9, m1
+ por m0, m8
+ mova m1, [rsp+16+ARCH_X86_32*4]
+ psubd m1, m2
+ mova m2, [rsp+32+ARCH_X86_32*4]
+ psubd m2, m3
+ mova m3, [base+pd_4096]
+ psrld m1, 9
+ pslld m2, 7
+ pand m1, m9
+ pandn m5, m9, m2
+ por m1, m5
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+ paddd m0, m3
+ paddd m1, m3
+ psrad m0, 13
+ psrad m1, 13
+ packssdw m0, m1
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*2+400*4+4]
+ movu m5, [t4+wq*2+400*4+2]
+ paddw m3, [t4+wq*2+400*4+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ paddw m3, m4, [t4+wq*2+400*12]
+ paddw m3, [t4+wq*2+400*10]
+ mova [t4+wq*2+400*10], m5
+ mova [t4+wq*2+400*12], m4
+ movu m1, [t3+wq*4+400*8+ 8]
+ movu m5, [t3+wq*4+400*8+ 4]
+ movu m7, [t3+wq*4+400*8+24]
+ movu m8, [t3+wq*4+400*8+20]
+ paddd m1, [t3+wq*4+400*8+ 0]
+ paddd m7, [t3+wq*4+400*8+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+ psubd m0, m8, m7
+ paddd m1, m4, [t3+wq*4+400*24+ 0]
+ paddd m7, m0, [t3+wq*4+400*24+16]
+ paddd m1, [t3+wq*4+400*20+ 0]
+ paddd m7, [t3+wq*4+400*20+16]
+ mova [t3+wq*4+400*20+ 0], m5
+ mova [t3+wq*4+400*20+16], m8
+ mova [t3+wq*4+400*24+ 0], m4
+ mova [t3+wq*4+400*24+16], m0
+ movq m5, [dstq+wq]
+ mova m2, [t4+wq*2+400* 6]
+ punpcklbw m5, m6
+ punpcklwd m4, m5, m6
+ punpcklwd m8, m2, m6
+ pmaddwd m8, m4 ; a5 * src
+ punpcklwd m0, m3, m6
+ pmaddwd m0, m4 ; a3 * src
+ punpckhwd m4, m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m4
+ punpckhwd m3, m6
+ pmaddwd m3, m4
+ psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13)
+ mova m0, [t3+wq*4+400*12+ 0]
+ psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13)
+ mova m4, [t3+wq*4+400*12+16]
+ psubd m4, m2
+ psubd m7, m3
+ pslld m1, 7
+ psrld m0, 8
+ psrld m4, 8
+ pslld m7, 7
+ pandn m3, m9, m1
+ pand m0, m9
+ por m0, m3
+ pand m4, m9
+ pandn m2, m9, m7
+ por m2, m4
+ mova m1, [base+pd_4096]
+ pmaddwd m0, m15
+ pmaddwd m2, m15
+ paddd m0, m1
+ paddd m2, m1
+ psrad m0, 13
+ psrad m2, 13
+ packssdw m0, m2
+ paddw m0, m5
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
diff --git a/third_party/dav1d/src/x86/mc.h b/third_party/dav1d/src/x86/mc.h
new file mode 100644
index 0000000000..65c607e180
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018-2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/mc.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(name, sse2)); \
+ decl_##type##_fn(BF(name, ssse3)); \
+ decl_##type##_fn(BF(name, avx2)); \
+ decl_##type##_fn(BF(name, avx512icl));
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+#define init_mc_scaled_fn(type, name, suffix) \
+ c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_scaled_fn(type, name, suffix) \
+ c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
+
+decl_fn(mc, dav1d_put_8tap_regular);
+decl_fn(mc, dav1d_put_8tap_regular_smooth);
+decl_fn(mc, dav1d_put_8tap_regular_sharp);
+decl_fn(mc, dav1d_put_8tap_smooth);
+decl_fn(mc, dav1d_put_8tap_smooth_regular);
+decl_fn(mc, dav1d_put_8tap_smooth_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp_regular);
+decl_fn(mc, dav1d_put_8tap_sharp_smooth);
+decl_fn(mc, dav1d_put_bilin);
+
+decl_fn(mct, dav1d_prep_8tap_regular);
+decl_fn(mct, dav1d_prep_8tap_regular_smooth);
+decl_fn(mct, dav1d_prep_8tap_regular_sharp);
+decl_fn(mct, dav1d_prep_8tap_smooth);
+decl_fn(mct, dav1d_prep_8tap_smooth_regular);
+decl_fn(mct, dav1d_prep_8tap_smooth_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp_regular);
+decl_fn(mct, dav1d_prep_8tap_sharp_smooth);
+decl_fn(mct, dav1d_prep_bilin);
+
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth);
+decl_fn(mc_scaled, dav1d_put_bilin_scaled);
+
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth);
+decl_fn(mct_scaled, dav1d_prep_bilin_scaled);
+
+decl_fn(avg, dav1d_avg);
+decl_fn(w_avg, dav1d_w_avg);
+decl_fn(mask, dav1d_mask);
+decl_fn(w_mask, dav1d_w_mask_420);
+decl_fn(w_mask, dav1d_w_mask_422);
+decl_fn(w_mask, dav1d_w_mask_444);
+decl_fn(blend, dav1d_blend);
+decl_fn(blend_dir, dav1d_blend_v);
+decl_fn(blend_dir, dav1d_blend_h);
+
+decl_fn(warp8x8, dav1d_warp_affine_8x8);
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
+decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
+
+decl_fn(emu_edge, dav1d_emu_edge);
+
+decl_fn(resize, dav1d_resize);
+
+static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
+ return;
+
+#if BITDEPTH == 8
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
+
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
+#endif
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ c->avg = BF(dav1d_avg, ssse3);
+ c->w_avg = BF(dav1d_w_avg, ssse3);
+ c->mask = BF(dav1d_mask, ssse3);
+ c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
+ c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
+ c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
+ c->blend = BF(dav1d_blend, ssse3);
+ c->blend_v = BF(dav1d_blend_v, ssse3);
+ c->blend_h = BF(dav1d_blend_h, ssse3);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
+ c->emu_edge = BF(dav1d_emu_edge, ssse3);
+ c->resize = BF(dav1d_resize, ssse3);
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
+ return;
+
+#if BITDEPTH == 8
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ c->avg = BF(dav1d_avg, avx2);
+ c->w_avg = BF(dav1d_w_avg, avx2);
+ c->mask = BF(dav1d_mask, avx2);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
+ c->blend = BF(dav1d_blend, avx2);
+ c->blend_v = BF(dav1d_blend_v, avx2);
+ c->blend_h = BF(dav1d_blend_h, avx2);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
+ c->emu_edge = BF(dav1d_emu_edge, avx2);
+ c->resize = BF(dav1d_resize, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+ return;
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ c->avg = BF(dav1d_avg, avx512icl);
+ c->w_avg = BF(dav1d_w_avg, avx512icl);
+ c->mask = BF(dav1d_mask, avx512icl);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
+ c->blend = BF(dav1d_blend, avx512icl);
+ c->blend_v = BF(dav1d_blend_v, avx512icl);
+ c->blend_h = BF(dav1d_blend_h, avx512icl);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
+ c->resize = BF(dav1d_resize, avx512icl);
+#endif
+}
diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm
new file mode 100644
index 0000000000..61eeaa1007
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@@ -0,0 +1,5879 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+; dav1d_obmc_masks[] * -512
+const obmc_masks_avx2
+ dw 0, 0, -9728, 0, -12800, -7168, -2560, 0
+ dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0
+ dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120
+ dw -4096, -3072, -2048, -1536, 0, 0, 0, 0
+ dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
+ dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608
+ dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024
+ dw 0, 0, 0, 0, 0, 0, 0, 0
+
+deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
+rescale_mul2: dd 0, 1, 4, 5, 2, 3, 6, 7
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+bdct_lb_q: times 8 db 0
+ times 8 db 4
+ times 8 db 8
+ times 8 db 12
+
+prep_mul: dw 16, 16, 4, 4
+put_bilin_h_rnd: dw 8, 8, 10, 10
+put_8tap_h_rnd: dd 34, 40
+s_8tap_h_rnd: dd 2, 8
+s_8tap_h_sh: dd 2, 4
+put_s_8tap_v_rnd: dd 512, 128
+put_s_8tap_v_sh: dd 10, 8
+prep_8tap_1d_rnd: dd 8 - (8192 << 4)
+prep_8tap_2d_rnd: dd 32 - (8192 << 5)
+warp8x8t_rnd: dd 16384 - (8192 << 15)
+warp8x8_shift: dd 5, 3
+warp8x8_rnd: dw 4096, 4096, 16384, 16384
+bidir_rnd: dw -16400, -16400, -16388, -16388
+bidir_mul: dw 2048, 2048, 8192, 8192
+
+%define pw_16 prep_mul
+%define pd_512 put_s_8tap_v_rnd
+
+pw_2: times 2 dw 2
+pw_64: times 2 dw 64
+pw_2048: times 2 dw 2048
+pw_8192: times 2 dw 8192
+pw_27615: times 2 dw 27615
+pw_32766: times 2 dw 32766
+pw_m512: times 2 dw -512
+pd_32: dd 32
+pd_63: dd 63
+pd_64: dd 64
+pd_32768: dd 32768
+pd_65538: dd 65538
+pd_m524256: dd -524256 ; (-8192 << 6) + 32
+pd_0x3ff: dd 0x3ff
+pq_0x40000000: dq 0x40000000
+ dd 0
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 64, 128
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)
+
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+ %%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+ %%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter
+cextern resize_filter
+
+SECTION .text
+
+INIT_XMM avx2
+cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ mov mxyd, r6m ; mx
+ lea r7, [put_avx2]
+%if UNIX64
+ DECLARE_REG_TMP 8
+ %define org_w r8d
+ mov r8d, wd
+%else
+ DECLARE_REG_TMP 7
+ %define org_w wm
+%endif
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+INIT_YMM avx2
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*0+32*1], m1
+ mova [dstq+dsq*1+32*0], m2
+ mova [dstq+dsq*1+32*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ add srcq, ssq
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ movu m0, [srcq+32*4]
+ movu m1, [srcq+32*5]
+ movu m2, [srcq+32*6]
+ movu m3, [srcq+32*7]
+ add srcq, ssq
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ movd xm5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ mov r6d, r8m ; bitdepth_max
+ add wq, r7
+ shr r6d, 11
+ vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4]
+ jmp wq
+.h_w2:
+ movq xm1, [srcq+ssq*0]
+ movhps xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw xm0, xm4, xm1
+ psrlq xm1, 16
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 4
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ movq xm1, [srcq+ssq*0+2]
+ movhps xm1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw xm0, xm4
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 4
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ movu xm1, [srcq+ssq*0+2]
+ vinserti128 m1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+ssq*1]
+ pmullw m2, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m5, [srcq+32*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m5, [srcq+32*1+2]
+ add srcq, ssq
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+.h_w128:
+ movifnidn t0d, org_w
+.h_w64_loop0:
+ mov r6d, t0d
+.h_w64_loop:
+ pmullw m0, m4, [srcq+r6*2-32*1]
+ pmullw m1, m5, [srcq+r6*2-32*1+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+r6*2-32*2]
+ pmullw m2, m5, [srcq+r6*2-32*2+2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+r6*2-32*1], m0
+ mova [dstq+r6*2-32*2], m1
+ sub r6d, 32
+ jg .h_w64_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w64_loop0
+ RET
+.v:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ shl mxyd, 11
+ movd xm5, mxyd
+ add wq, r7
+ vpbroadcastw m5, xm5
+ jmp wq
+.v_w2:
+ movd xm0, [srcq+ssq*0]
+.v_w2_loop:
+ movd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq xm2, xm0, xm1
+ movd xm0, [srcq+ssq*0]
+ punpckldq xm1, xm0
+ psubw xm1, xm2
+ pmulhrsw xm1, xm5
+ paddw xm1, xm2
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xm0, [srcq+ssq*0]
+.v_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq xm2, xm0, xm1
+ movq xm0, [srcq+ssq*0]
+ punpcklqdq xm1, xm0
+ psubw xm1, xm2
+ pmulhrsw xm1, xm5
+ paddw xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xm0, [srcq+ssq*0]
+.v_w8_loop:
+ vbroadcasti128 m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m2, m0, m1, 0xf0
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m1, m0, 0xf0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w32:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+.v_w32_loop:
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m4, m2, m0
+ pmulhrsw m4, m5
+ paddw m4, m0
+ movu m0, [srcq+ssq*0+32*0]
+ mova [dstq+dsq*0+32*0], m4
+ psubw m4, m3, m1
+ pmulhrsw m4, m5
+ paddw m4, m1
+ movu m1, [srcq+ssq*0+32*1]
+ mova [dstq+dsq*0+32*1], m4
+ psubw m4, m0, m2
+ pmulhrsw m4, m5
+ paddw m4, m2
+ mova [dstq+dsq*1+32*0], m4
+ psubw m4, m1, m3
+ pmulhrsw m4, m5
+ paddw m4, m3
+ mova [dstq+dsq*1+32*1], m4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w16:
+.v_w64:
+.v_w128:
+ movifnidn t0d, org_w
+ add t0d, t0d
+ mov r4, srcq
+ lea r6d, [hq+t0*8-256]
+ mov r7, dstq
+.v_w16_loop0:
+ movu m0, [srcq+ssq*0]
+.v_w16_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m5
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m5
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11
+ vpbroadcastd m3, [pw_2]
+ movd xm6, mxyd
+ vpbroadcastd m7, [pw_8192]
+ add wq, r7
+ vpbroadcastw m6, xm6
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ vpbroadcastd m7, [pw_2048]
+.hv_12bpc:
+ jmp wq
+.hv_w2:
+ vpbroadcastq xm1, [srcq+ssq*0]
+ pmullw xm0, xm4, xm1
+ psrlq xm1, 16
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+.hv_w2_loop:
+ movq xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm2, [srcq+ssq*0]
+ pmullw xm1, xm4, xm2
+ psrlq xm2, 16
+ pmullw xm2, xm5
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2 ; 1 _ 2 _
+ shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ pmullw xm0, xm4, [srcq+ssq*0-8]
+ pmullw xm1, xm5, [srcq+ssq*0-6]
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+.hv_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ movq xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps xm1, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*0+2]
+ pmullw xm1, xm4
+ pmullw xm2, xm5
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2 ; 1 2
+ shufpd xm2, xm0, xm1, 0x01 ; 0 1
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+ssq*0]
+ pmullw xm1, xm5, [srcq+ssq*0+2]
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+ vinserti128 m0, xm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m1, [srcq+ssq*0], 1
+ vinserti128 m2, [srcq+ssq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+.hv_w32:
+.hv_w64:
+.hv_w128:
+%if UNIX64
+ lea r6d, [r8*2-32]
+%else
+ mov r6d, wm
+ lea r6d, [r6*2-32]
+%endif
+ mov r4, srcq
+ lea r6d, [hq+r6*8]
+ mov r7, dstq
+.hv_w16_loop0:
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w16_loop:
+ pmullw m1, m4, [srcq+ssq*1]
+ pmullw m2, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2
+ psubw m2, m1, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ pmulhrsw m2, m7
+ mova [dstq+dsq*0], m2
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m1
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m1
+ pmulhrsw m2, m7
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ RET
+
+cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep_avx2]
+%if UNIX64
+ DECLARE_REG_TMP 7
+ %define org_w r7d
+%else
+ DECLARE_REG_TMP 6
+ %define org_w r5m
+%endif
+ mov org_w, wd
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [r6-prep_avx2+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4]
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m0, m1, 0x30
+ vpblendd m0, m2, 0xc0
+ pmullw m0, m4
+ psubw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m4, [srcq+strideq*2]
+ pmullw m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmullw m0, m4, [srcq+strideq*0+32*0]
+ pmullw m1, m4, [srcq+strideq*0+32*1]
+ pmullw m2, m4, [srcq+strideq*1+32*0]
+ pmullw m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m4, [srcq+32*2]
+ pmullw m3, m4, [srcq+32*3]
+ add srcq, strideq
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m4, [srcq+32*2]
+ pmullw m3, m4, [srcq+32*3]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmullw m0, m4, [srcq+32*4]
+ pmullw m1, m4, [srcq+32*5]
+ pmullw m2, m4, [srcq+32*6]
+ pmullw m3, m4, [srcq+32*7]
+ add tmpq, 32*8
+ add srcq, strideq
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5
+ vpbroadcastd m3, [pw_32766]
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti128 m2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq m0, m1, m2
+ psrldq m1, 2
+ pslldq m2, 6
+ pmullw m0, m4
+ vpblendd m1, m2, 0xcc
+ pmullw m1, m5
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*0+2]
+ vinserti128 m1, [srcq+strideq*1+2], 1
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m5, [srcq+strideq*0+2]
+ psubw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+.h_w64:
+.h_w128:
+ movifnidn t0d, org_w
+.h_w32_loop0:
+ mov r3d, t0d
+.h_w32_loop:
+ pmullw m0, m4, [srcq+r3*2-32*1]
+ pmullw m1, m5, [srcq+r3*2-32*1+2]
+ psubw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+r3*2-32*2]
+ pmullw m2, m5, [srcq+r3*2-32*2+2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+r3*2-32*1], m0
+ mova [tmpq+r3*2-32*2], m1
+ sub r3d, 32
+ jg .h_w32_loop
+ add srcq, strideq
+ lea tmpq, [tmpq+t0*2]
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ movd xm5, mxyd
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5
+ vpbroadcastd m3, [pw_32766]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.v_12bpc:
+ jmp wq
+.v_w4:
+ movq xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq xm1, [srcq+strideq*1]
+ vpblendd m2, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0xf0 ; 1 1 3 3
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m1, m2, 0x33 ; 0 1 2 3
+ vpblendd m0, m2, 0x0c ; 4 2 4 4
+ punpckhqdq m2, m1, m0 ; 1 2 3 4
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m1, m0, m2, 0xf0 ; 0 1
+ vbroadcasti128 m0, [srcq+strideq*0]
+ vpblendd m2, m0, 0xf0 ; 1 2
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5, m2
+ psubw m0, m3
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m4
+ mova [tmpq+32*0], m1
+ pmullw m1, m5, m0
+ psubw m2, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ PUSH r7
+%endif
+ movifnidn r7d, org_w
+ add r7d, r7d
+ mov r3, srcq
+ lea r6d, [hq+r7*8-256]
+ mov r5, tmpq
+.v_w32_loop0:
+ movu m0, [srcq+strideq*0]
+.v_w32_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5, m2
+ psubw m0, m3
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m4
+ mova [tmpq+r7*0], m1
+ pmullw m1, m5, m0
+ psubw m2, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+r7*1], m1
+ lea tmpq, [tmpq+r7*2]
+ sub hd, 2
+ jg .v_w32_loop
+ add r3, 32
+ add r5, 32
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ movd xm6, mxyd
+ add wq, r6
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w4:
+ movu xm1, [srcq+strideq*0]
+%if WIN64
+ movaps [rsp+24], xmm7
+%endif
+ pmullw xm0, xm4, xm1
+ psrldq xm1, 2
+ pmullw xm1, xm5
+ psubw xm0, xm3
+ paddw xm0, xm1
+ psraw xm0, 2
+ vpbroadcastq m0, xm0
+.hv_w4_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, [srcq+stride3q ], 1
+ movu xm2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, [srcq+strideq*0], 1
+ punpcklqdq m7, m1, m2
+ psrldq m1, 2
+ pslldq m2, 6
+ pmullw m7, m4
+ vpblendd m1, m2, 0xcc
+ pmullw m1, m5
+ psubw m7, m3
+ paddw m1, m7
+ psraw m1, 2 ; 1 2 3 4
+ vpblendd m0, m1, 0x3f
+ vpermq m2, m0, q2103 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+%if WIN64
+ movaps xmm7, [rsp+24]
+%endif
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+strideq*0]
+ pmullw xm1, xm5, [srcq+strideq*0+2]
+ psubw xm0, xm3
+ paddw xm0, xm1
+ psraw xm0, 2
+ vinserti128 m0, xm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ movu xm2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m1, [srcq+strideq*0], 1
+ vinserti128 m2, [srcq+strideq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+.hv_w32:
+.hv_w64:
+.hv_w128:
+%if WIN64
+ PUSH r7
+%endif
+ movifnidn r7d, org_w
+ add r7d, r7d
+ mov r3, srcq
+ lea r6d, [hq+r7*8-256]
+ mov r5, tmpq
+.hv_w16_loop0:
+ pmullw m0, m4, [srcq]
+ pmullw m1, m5, [srcq+2]
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+.hv_w16_loop:
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+r7*0], m2
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ psubw m0, m3
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+r7*1], m2
+ lea tmpq, [tmpq+r7*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r3, 32
+ add r5, 32
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
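+; each FILTER_* constant packs two row offsets into subpel_filters[5][15][8]:
+; the high 16 bits select the 8-tap row (regular/smooth/sharp), the low 16
+; bits the 4-tap row used when the filtered dimension is 4 or smaller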
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
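+; mx/my are multiplied by 0x010101 so that adding the packed FILTER_* value
+; produces the 8-tap table offset in bits 16-23 and the 4-tap offset in the
+; low byte at once; the 0xf00 tests below then simply check whether the
+; horizontal/vertical subpel position is nonzero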
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx2
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movifnidn wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ sub srcq, 2
+ mova xm2, [subpel_h_shuf2]
+ vpbroadcastd xm3, [base+subpel_filters+mxq*8+2]
+ pmovsxbw xm3, xm3
+.h_w2_loop:
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm2
+ pshufb xm1, xm2
+ pmaddwd xm0, xm3
+ pmaddwd xm1, xm3
+ phaddd xm0, xm1
+ paddd xm0, xm4
+ psrad xm0, 6
+ packusdw xm0, xm0
+ pminsw xm0, xm5
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xm3, [base+subpel_filters+mxq*8]
+ WIN64_SPILL_XMM 8
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ pshufd xm3, xm3, q2211
+ vpbroadcastq m2, xm3
+ vpermq m3, m3, q1111
+.h_w4_loop:
+ movu xm1, [srcq+ssq*0]
+ vinserti128 m1, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m2
+ pmaddwd m1, m3
+ paddd m0, m4
+ paddd m0, m1
+ psrad m0, 6
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1
+ pminsw xm0, xm5
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov r7d, r8m
+ vpbroadcastw m5, r8m
+ shr r7d, 11
+ vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4]
+ cmp wd, 4
+ je .h_w4
+ jl .h_w2
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 13
+ shr mxd, 16
+ sub srcq, 6
+ vpbroadcastq m0, [base+subpel_filters+mxq*8]
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 8
+ jg .h_w16
+.h_w8:
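+; PUT_8TAP_H filters 16 pixels at a time: %1/%2/%3 hold overlapping loads
+; from src+0/src+8/src+16; the result is rounded with m4, shifted down by 6
+; and clamped to the pixel maximum in m5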
+%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m4
+ paddd m%1, m%5
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m4
+ paddd m%4, m%5
+ paddd m%3, m%4
+ paddd m%2, m%3
+ psrad m%1, 6
+ psrad m%2, 6
+ packusdw m%1, m%2
+ pminsw m%1, m5
+%endmacro
+ movu xm0, [srcq+ssq*0+ 0]
+ vinserti128 m0, [srcq+ssq*1+ 0], 1
+ movu xm2, [srcq+ssq*0+16]
+ vinserti128 m2, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ shufpd m1, m0, m2, 0x05
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mov r6d, wd
+.h_w16_loop:
+ movu m0, [srcq+r6*2-32]
+ movu m1, [srcq+r6*2-24]
+ movu m2, [srcq+r6*2-16]
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+r6*2-32], m0
+ sub r6d, 16
+ jg .h_w16_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastd m6, [pd_32]
+ vpbroadcastw m7, r8m
+ lea r6, [ssq*3]
+ sub srcq, r6
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 4
+ jg .v_w8
+ je .v_w4
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+ movd xm3, [srcq+ssq*0]
+ vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm0, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklwd xm3, xm1 ; 45 56
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xm4, [srcq+ssq*0]
+ pmaddwd xm5, xm8, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm9 ; a1 b1
+ paddd xm5, xm6
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm10 ; a2 b2
+ paddd xm5, xm3
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklwd xm3, xm4 ; 67 78
+ pmaddwd xm4, xm11, xm3 ; a3 b3
+ paddd xm5, xm4
+ psrad xm5, 6
+ packusdw xm5, xm5
+ pminsw xm5, xm7
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m0, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ vpbroadcastq m4, [srcq+r6 ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpbroadcastq m5, [srcq+ssq*1]
+ vpblendd m1, m0, 0x30
+ vpblendd m0, m2, 0x30
+ punpcklwd m1, m0 ; 01 12
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m3, 0x30
+ punpcklwd m2, m4 ; 23 34
+ vpblendd m3, m5, 0x30
+ vpblendd m5, m0, 0x30
+ punpcklwd m3, m5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*0]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m6
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m0, 0x30
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 6
+ vextracti128 xm4, m5, 1
+ packusdw xm5, xm4
+ pminsw xm5, xm7
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ shl wd, 5
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [hq+wq-256]
+.v_w8_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ vbroadcasti128 m0, [srcq+r6 ]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ lea srcq, [srcq+ssq*4]
+ vbroadcasti128 m1, [srcq+ssq*0]
+ vbroadcasti128 m2, [srcq+ssq*1]
+ vbroadcasti128 m3, [srcq+ssq*2]
+ add srcq, r6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklwd m3, m6, m0 ; 23
+ punpckhwd m6, m0 ; 56
+.v_w8_loop:
+ vbroadcasti128 m14, [srcq+ssq*0]
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ vbroadcasti128 m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ paddd m13, m6
+ shufpd m6, m0, m14, 0x0d
+ shufpd m0, m14, m5, 0x0c
+ punpcklwd m5, m6, m0 ; 67
+ punpckhwd m6, m0 ; 78
+ pmaddwd m14, m11, m5 ; a3
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 5
+ psrad m13, 5
+ packusdw m12, m13
+ pxor m13, m13
+ pavgw m12, m13
+ pminsw m12, m7
+ vpermq m12, m12, q3120
+ mova [dstq+dsq*0], xm12
+ vextracti128 [dstq+dsq*1], m12, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ add r7, 16
+ add r8, 16
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .v_w8_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vpbroadcastw m15, r8m
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ vpbroadcastd m6, [pd_512]
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ test dword r8m, 0x800
+ jz .hv_10bit
+ psraw m7, 2
+ psllw m1, 2
+.hv_10bit:
+ pshufd m11, m1, q0000
+ pshufd m12, m1, q1111
+ pshufd m13, m1, q2222
+ pshufd m14, m1, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m9, [subpel_h_shuf2]
+ vbroadcasti128 m1, [srcq+r6 ] ; 3 3
+ movu xm3, [srcq+ssq*2]
+ movu xm0, [srcq+ssq*0]
+ movu xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m3, [srcq+ssq*0], 1 ; 2 4
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 5
+ vinserti128 m2, [srcq+ssq*2], 1 ; 1 6
+ add srcq, r6
+ pshufb m1, m9
+ pshufb m3, m9
+ pshufb m0, m9
+ pshufb m2, m9
+ pmaddwd m1, m7
+ pmaddwd m3, m7
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ phaddd m1, m3
+ phaddd m0, m2
+ paddd m1, m6
+ paddd m0, m6
+ psrad m1, 10
+ psrad m0, 10
+ packssdw m1, m0 ; 3 2 0 1
+ vextracti128 xm0, m1, 1 ; 3 4 5 6
+ pshufd xm2, xm1, q1301 ; 2 3 1 2
+ pshufd xm3, xm0, q2121 ; 4 5 4 5
+ punpckhwd xm1, xm2 ; 01 12
+ punpcklwd xm2, xm0 ; 23 34
+ punpckhwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm4, xm9
+ pshufb xm5, xm9
+ pmaddwd xm4, xm7
+ pmaddwd xm5, xm7
+ phaddd xm4, xm5
+ pmaddwd xm5, xm11, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm12 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm13 ; a2 b2
+ paddd xm5, xm3
+ paddd xm4, xm6
+ psrad xm4, 10
+ packssdw xm4, xm4
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm14, xm3 ; a3 b3
+ paddd xm5, xm6
+ paddd xm5, xm4
+ psrad xm5, 10
+ packusdw xm5, xm5
+ pminsw xm5, xm15
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ vbroadcasti128 m9, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ pshufd m8, m7, q1111
+ pshufd m7, m7, q0000
+ movu xm1, [srcq+ssq*0]
+ vinserti128 m1, [srcq+ssq*1], 1 ; 0 1
+ vbroadcasti128 m0, [srcq+r6 ]
+ vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 3 4
+ movu xm3, [srcq+ssq*1]
+ vinserti128 m3, [srcq+ssq*2], 1 ; 5 6
+ add srcq, r6
+ pshufb m4, m1, m9
+ pshufb m1, m10
+ pmaddwd m4, m7
+ pmaddwd m1, m8
+ pshufb m5, m2, m9
+ pshufb m2, m10
+ pmaddwd m5, m7
+ pmaddwd m2, m8
+ paddd m4, m6
+ paddd m1, m4
+ pshufb m4, m0, m9
+ pshufb m0, m10
+ pmaddwd m4, m7
+ pmaddwd m0, m8
+ paddd m5, m6
+ paddd m2, m5
+ pshufb m5, m3, m9
+ pshufb m3, m10
+ pmaddwd m5, m7
+ pmaddwd m3, m8
+ paddd m4, m6
+ paddd m4, m0
+ paddd m5, m6
+ paddd m5, m3
+ vperm2i128 m0, m1, m2, 0x21
+ psrld m1, 10
+ psrld m2, 10
+ vperm2i128 m3, m4, m5, 0x21
+ pslld m4, 6
+ pslld m5, 6
+ pblendw m2, m4, 0xaa ; 23 34
+ pslld m0, 6
+ pblendw m1, m0, 0xaa ; 01 12
+ psrld m3, 10
+ pblendw m3, m5, 0xaa ; 45 56
+ psrad m0, m5, 16
+.hv_w4_loop:
+ movu xm4, [srcq+ssq*0]
+ vinserti128 m4, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m6
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ pshufb m3, m4, m9
+ pshufb m4, m10
+ pmaddwd m3, m7
+ pmaddwd m4, m8
+ paddd m3, m6
+ paddd m4, m3
+ psrad m4, 10
+ packssdw m0, m4 ; _ 7 6 8
+ vpermq m3, m0, q1122 ; _ 6 _ 7
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m4, m5
+ psrad m4, 10
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, xm15
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ vpbroadcastq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ pmovsxbw xm1, [base+subpel_filters+myq*8]
+ shl wd, 5
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ pxor m0, m0
+ punpcklbw m0, m2
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [hq+wq-256]
+ test dword r8m, 0x800
+ jz .hv_w8_10bit
+ psraw m0, 2
+ psllw xm1, 2
+.hv_w8_10bit:
+ pshufd m11, m0, q0000
+ pshufd m12, m0, q1111
+ pshufd m13, m0, q2222
+ pshufd m14, m0, q3333
+%if WIN64
+ %define v_mul (rsp+stack_offset+40) ; r4m
+%else
+ %define v_mul (rsp-24) ; red zone
+%endif
+ mova [v_mul], xm1
+.hv_w8_loop0:
+%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
+ pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
+ pmaddwd m3, m12, m2
+ pmaddwd m%1, m11
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m3, m10
+ paddd m%1, m3
+ pmaddwd m3, m14, m%2
+ paddd m%1, m3
+ pmaddwd m3, m13, m2
+ pshufb m%3, m9 ; a b b c c d d e
+ pmaddwd m2, m11
+ paddd m%1, m3
+ pmaddwd m3, m12, m%2
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m14
+ pmaddwd m%2, m13
+ paddd m2, m10
+ paddd m2, m3
+ paddd m%3, m2
+ paddd m%2, m%3
+ psrad m%1, 10
+ psrad m%2, 10
+ packssdw m%1, m%2
+%endmacro
+ movu xm4, [srcq+r6 *1+ 0]
+ vbroadcasti128 m8, [subpel_h_shufA]
+ movu xm6, [srcq+r6 *1+ 8]
+ vbroadcasti128 m9, [subpel_h_shufB]
+ movu xm0, [srcq+r6 *1+16]
+ vpbroadcastd m10, [pd_512]
+ movu xm5, [srcq+ssq*0+ 0]
+ vinserti128 m5, [srcq+ssq*4+ 0], 1
+ movu xm1, [srcq+ssq*0+16]
+ vinserti128 m1, [srcq+ssq*4+16], 1
+ shufpd m7, m5, m1, 0x05
+ INIT_XMM avx2
+ PUT_8TAP_HV_H 4, 6, 0 ; 3
+ INIT_YMM avx2
+ PUT_8TAP_HV_H 5, 7, 1 ; 0 4
+ movu xm0, [srcq+ssq*2+ 0]
+ vinserti128 m0, [srcq+r6 *2+ 0], 1
+ movu xm1, [srcq+ssq*2+16]
+ vinserti128 m1, [srcq+r6 *2+16], 1
+ shufpd m7, m0, m1, 0x05
+ PUT_8TAP_HV_H 0, 7, 1 ; 2 6
+ movu xm6, [srcq+ssq*1+ 0]
+ movu xm1, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m6, [srcq+ssq*1+ 0], 1
+ vinserti128 m1, [srcq+ssq*1+16], 1
+ add srcq, r6
+ shufpd m7, m6, m1, 0x05
+ PUT_8TAP_HV_H 6, 7, 1 ; 1 5
+ vpermq m4, m4, q1100
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ vpermq m7, m0, q3120
+ punpcklwd m3, m7, m4 ; 23
+ punpckhwd m4, m5 ; 34
+ punpcklwd m1, m5, m6 ; 01
+ punpckhwd m5, m6 ; 45
+ punpcklwd m2, m6, m7 ; 12
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vpbroadcastd m9, [v_mul+4*0]
+ vpbroadcastd m7, [v_mul+4*1]
+ vpbroadcastd m10, [v_mul+4*2]
+ pmaddwd m8, m9, m1 ; a0
+ pmaddwd m9, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m7 ; a1
+ pmaddwd m4, m7 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ movu xm5, [srcq+ssq*0]
+ vinserti128 m5, [srcq+ssq*1], 1
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ movu xm6, [srcq+ssq*0+16]
+ vinserti128 m6, [srcq+ssq*1+16], 1
+ vextracti128 [dstq], m0, 1
+ pshufb m0, m5, m7 ; 01
+ pshufb m5, m10 ; 23
+ pmaddwd m0, m11
+ pmaddwd m5, m12
+ paddd m0, m5
+ pshufb m5, m6, m7 ; 89
+ pshufb m6, m10 ; ab
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ paddd m6, m5
+ movu xm5, [srcq+ssq*0+8]
+ vinserti128 m5, [srcq+ssq*1+8], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m7, m5, m7
+ pshufb m5, m10
+ pmaddwd m10, m13, m7
+ pmaddwd m7, m11
+ paddd m0, m10
+ vpbroadcastd m10, [pd_512]
+ paddd m6, m7
+ pmaddwd m7, m14, m5
+ pmaddwd m5, m12
+ paddd m0, m7
+ paddd m5, m6
+ vbroadcasti128 m6, [dstq]
+ paddd m8, m10
+ paddd m9, m10
+ paddd m0, m10
+ paddd m5, m10
+ vpbroadcastd m10, [v_mul+4*3]
+ psrad m0, 10
+ psrad m5, 10
+ packssdw m0, m5
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m10, m5 ; a3
+ pmaddwd m10, m6 ; b3
+ paddd m7, m8
+ paddd m9, m10
+ psrad m7, 10
+ psrad m9, 10
+ packusdw m7, m9
+ pminsw m7, m15
+ vpermq m7, m7, q3120
+ mova [dstq+dsq*0], xm7
+ vextracti128 [dstq+dsq*1], m7, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r7, 16
+ add r8, 16
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
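+; prep_8tap stores signed 16-bit intermediates; the rounding constants fold
+; in the intermediate bias (note the "8 - (8192 << 4)" constant below) so the
+; values fit in int16 without a separate subtraction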
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
+%define base r7-prep_avx2
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx2]
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov r6d, r7m ; bitdepth_max
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ vpbroadcastd m5, [r7-prep_avx2+pw_8192]
+ shr r6d, 11
+ add wq, r7
+ vpbroadcastd m4, [base+prep_mul+r6*4]
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xm0, [base+subpel_filters+mxq*8]
+ vbroadcasti128 m3, [subpel_h_shufA]
+ vbroadcasti128 m4, [subpel_h_shufB]
+ WIN64_SPILL_XMM 8
+ pshufd xm0, xm0, q2211
+ test dword r7m, 0x800
+ jnz .h_w4_12bpc
+ psllw xm0, 2
+.h_w4_12bpc:
+ vpbroadcastq m6, xm0
+ vpermq m7, m0, q1111
+.h_w4_loop:
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti128 m2, [srcq+r6 ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m4 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ paddd m0, m5
+ paddd m0, m1
+ pshufb m1, m2, m3
+ pshufb m2, m4
+ pmaddwd m1, m6
+ pmaddwd m2, m7
+ paddd m1, m5
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
+ lea r6, [strideq*3]
+ cmp wd, 4
+ je .h_w4
+ shr mxd, 16
+ sub srcq, 6
+ vpbroadcastq m0, [base+subpel_filters+mxq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m0, 2
+.h_12bpc:
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 8
+ jg .h_w16
+.h_w8:
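+; same structure as PUT_8TAP_H above, but shifted down by 4 and packed signed
+; for the intermediate format instead of being clamped to the pixel range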
+%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m5
+ paddd m%1, m%5
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m5
+ paddd m%4, m%5
+ paddd m%3, m%4
+ paddd m%2, m%3
+ psrad m%1, 4
+ psrad m%2, 4
+ packssdw m%1, m%2
+%endmacro
+ movu xm0, [srcq+strideq*0+ 0]
+ vinserti128 m0, [srcq+strideq*1+ 0], 1
+ movu xm2, [srcq+strideq*0+16]
+ vinserti128 m2, [srcq+strideq*1+16], 1
+ lea srcq, [srcq+strideq*2]
+ shufpd m1, m0, m2, 0x05
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ add wd, wd
+.h_w16_loop0:
+ mov r6d, wd
+.h_w16_loop:
+ movu m0, [srcq+r6-32]
+ movu m1, [srcq+r6-24]
+ movu m2, [srcq+r6-16]
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq+r6-32], m0
+ sub r6d, 32
+ jg .h_w16_loop
+ add srcq, strideq
+ add tmpq, wq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastd m7, [prep_8tap_1d_rnd]
+ lea r6, [strideq*3]
+ sub srcq, r6
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m0, 2
+.v_12bpc:
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 4
+ jg .v_w8
+.v_w4:
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m0, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m4, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpblendd m1, m0, 0x30
+ vpblendd m0, m2, 0x30
+ punpcklwd m1, m0 ; 01 12
+ vpbroadcastq m0, [srcq+strideq*2]
+ add srcq, r6
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m3, 0x30
+ punpcklwd m2, m4 ; 23 34
+ vpblendd m3, m5, 0x30
+ vpblendd m5, m0, 0x30
+ punpcklwd m3, m5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq m4, [srcq+strideq*0]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m7
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m0, 0x30
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 4
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4
+ mova [tmpq], xm5
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+%if WIN64
+ push r8
+%endif
+ mov r8d, wd
+ shl wd, 5
+ mov r5, srcq
+ mov r7, tmpq
+ lea wd, [hq+wq-256]
+.v_w8_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ vbroadcasti128 m0, [srcq+r6 ]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m3, [srcq+strideq*2]
+ add srcq, r6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklwd m3, m6, m0 ; 23
+ punpckhwd m6, m0 ; 56
+.v_w8_loop:
+ vbroadcasti128 m14, [srcq+strideq*0]
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m7
+ paddd m13, m7
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ paddd m13, m6
+ shufpd m6, m0, m14, 0x0d
+ shufpd m0, m14, m5, 0x0c
+ punpcklwd m5, m6, m0 ; 67
+ punpckhwd m6, m0 ; 78
+ pmaddwd m14, m11, m5 ; a3
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 4
+ psrad m13, 4
+ packssdw m12, m13
+ vpermq m12, m12, q3120
+ mova [tmpq+r8*0], xm12
+ vextracti128 [tmpq+r8*2], m12, 1
+ lea tmpq, [tmpq+r8*4]
+ sub hd, 2
+ jg .v_w8_loop
+ add r5, 16
+ add r7, 16
+ movzx hd, wb
+ mov srcq, r5
+ mov tmpq, r7
+ sub wd, 1<<8
+ jg .v_w8_loop0
+%if WIN64
+ pop r8
+%endif
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vpbroadcastd m15, [prep_8tap_2d_rnd]
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 2
+ sub srcq, r6
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m1, m1
+ psraw m7, 4
+ psraw m1, 8
+ test dword r7m, 0x800
+ jz .hv_w4_10bit
+ psraw m7, 2
+.hv_w4_10bit:
+ pshufd m11, m1, q0000
+ pshufd m12, m1, q1111
+ pshufd m13, m1, q2222
+ pshufd m14, m1, q3333
+.hv_w4:
+ vbroadcasti128 m9, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ pshufd m8, m7, q1111
+ pshufd m7, m7, q0000
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*1], 1 ; 0 1
+ vbroadcasti128 m0, [srcq+r6 ]
+ vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 3 4
+ movu xm3, [srcq+strideq*1]
+ vinserti128 m3, [srcq+strideq*2], 1 ; 5 6
+ add srcq, r6
+ pshufb m4, m1, m9
+ pshufb m1, m10
+ pmaddwd m4, m7
+ pmaddwd m1, m8
+ pshufb m5, m2, m9
+ pshufb m2, m10
+ pmaddwd m5, m7
+ pmaddwd m2, m8
+ paddd m4, m15
+ paddd m1, m4
+ pshufb m4, m0, m9
+ pshufb m0, m10
+ pmaddwd m4, m7
+ pmaddwd m0, m8
+ paddd m5, m15
+ paddd m2, m5
+ pshufb m5, m3, m9
+ pshufb m3, m10
+ pmaddwd m5, m7
+ pmaddwd m3, m8
+ paddd m4, m15
+ paddd m4, m0
+ paddd m5, m15
+ paddd m5, m3
+ vperm2i128 m0, m1, m2, 0x21
+ psrld m1, 6
+ psrld m2, 6
+ vperm2i128 m3, m4, m5, 0x21
+ pslld m4, 10
+ pslld m5, 10
+ pblendw m2, m4, 0xaa ; 23 34
+ pslld m0, 10
+ pblendw m1, m0, 0xaa ; 01 12
+ psrld m3, 6
+ pblendw m3, m5, 0xaa ; 45 56
+ psrad m0, m5, 16
+.hv_w4_loop:
+ movu xm4, [srcq+strideq*0]
+ vinserti128 m4, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m15
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ pshufb m3, m4, m9
+ pshufb m4, m10
+ pmaddwd m3, m7
+ pmaddwd m4, m8
+ paddd m3, m15
+ paddd m4, m3
+ psrad m4, 6
+ packssdw m0, m4 ; _ 7 6 8
+ vpermq m3, m0, q1122 ; _ 6 _ 7
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m4, m5
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ vpbroadcastq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ pmovsxbw xm1, [base+subpel_filters+myq*8]
+%if WIN64
+ PUSH r8
+%endif
+ mov r8d, wd
+ shl wd, 5
+ lea r6, [strideq*3]
+ sub srcq, 6
+ sub srcq, r6
+ mov r5, srcq
+ mov r7, tmpq
+ lea wd, [hq+wq-256]
+ pxor m0, m0
+ punpcklbw m0, m2
+ mova [v_mul], xm1
+ psraw m0, 4
+ test dword r7m, 0x800
+ jz .hv_w8_10bit
+ psraw m0, 2
+.hv_w8_10bit:
+ pshufd m11, m0, q0000
+ pshufd m12, m0, q1111
+ pshufd m13, m0, q2222
+ pshufd m14, m0, q3333
+.hv_w8_loop0:
+%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
+ pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
+ pmaddwd m3, m12, m2
+ pmaddwd m%1, m11
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m3, m15
+ paddd m%1, m3
+ pmaddwd m3, m14, m%2
+ paddd m%1, m3
+ pmaddwd m3, m13, m2
+ pshufb m%3, m9 ; a b b c c d d e
+ pmaddwd m2, m11
+ paddd m%1, m3
+ pmaddwd m3, m12, m%2
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m14
+ pmaddwd m%2, m13
+ paddd m2, m15
+ paddd m2, m3
+ paddd m2, m%3
+ paddd m2, m%2
+ psrad m%1, 6
+ psrad m2, 6
+ packssdw m%1, m2
+%endmacro
+ movu xm4, [srcq+r6 + 0]
+ vbroadcasti128 m8, [subpel_h_shufA]
+ movu xm6, [srcq+r6 + 8]
+ vbroadcasti128 m9, [subpel_h_shufB]
+ movu xm0, [srcq+r6 +16]
+ movu xm5, [srcq+strideq*0+ 0]
+ vinserti128 m5, [srcq+strideq*4+ 0], 1
+ movu xm1, [srcq+strideq*0+16]
+ vinserti128 m1, [srcq+strideq*4+16], 1
+ shufpd m7, m5, m1, 0x05
+ INIT_XMM avx2
+ PREP_8TAP_HV_H 4, 6, 0 ; 3
+ INIT_YMM avx2
+ PREP_8TAP_HV_H 5, 7, 1 ; 0 4
+ movu xm0, [srcq+strideq*2+ 0]
+ vinserti128 m0, [srcq+r6 *2+ 0], 1
+ movu xm1, [srcq+strideq*2+16]
+ vinserti128 m1, [srcq+r6 *2+16], 1
+ shufpd m7, m0, m1, 0x05
+ PREP_8TAP_HV_H 0, 7, 1 ; 2 6
+ movu xm6, [srcq+strideq*1+ 0]
+ movu xm1, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m6, [srcq+strideq*1+ 0], 1
+ vinserti128 m1, [srcq+strideq*1+16], 1
+ add srcq, r6
+ shufpd m7, m6, m1, 0x05
+ PREP_8TAP_HV_H 6, 7, 1 ; 1 5
+ vpermq m4, m4, q1100
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ vpermq m7, m0, q3120
+ punpcklwd m3, m7, m4 ; 23
+ punpckhwd m4, m5 ; 34
+ punpcklwd m1, m5, m6 ; 01
+ punpckhwd m5, m6 ; 45
+ punpcklwd m2, m6, m7 ; 12
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vpbroadcastd m9, [v_mul+4*0]
+ vpbroadcastd m7, [v_mul+4*1]
+ vpbroadcastd m10, [v_mul+4*2]
+ pmaddwd m8, m9, m1 ; a0
+ pmaddwd m9, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m7 ; a1
+ pmaddwd m4, m7 ; b1
+ paddd m8, m15
+ paddd m9, m15
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ movu xm5, [srcq+strideq*0]
+ vinserti128 m5, [srcq+strideq*1], 1
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ movu xm6, [srcq+strideq*0+16]
+ vinserti128 m6, [srcq+strideq*1+16], 1
+ vextracti128 [tmpq], m0, 1
+ pshufb m0, m5, m7 ; 01
+ pshufb m5, m10 ; 23
+ pmaddwd m0, m11
+ pmaddwd m5, m12
+ paddd m0, m15
+ paddd m0, m5
+ pshufb m5, m6, m7 ; 89
+ pshufb m6, m10 ; ab
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ paddd m5, m15
+ paddd m6, m5
+ movu xm5, [srcq+strideq*0+8]
+ vinserti128 m5, [srcq+strideq*1+8], 1
+ lea srcq, [srcq+strideq*2]
+ pshufb m7, m5, m7
+ pshufb m5, m10
+ pmaddwd m10, m13, m7
+ pmaddwd m7, m11
+ paddd m0, m10
+ paddd m6, m7
+ pmaddwd m7, m14, m5
+ pmaddwd m5, m12
+ paddd m0, m7
+ paddd m5, m6
+ vbroadcasti128 m6, [tmpq]
+ vpbroadcastd m10, [v_mul+4*3]
+ psrad m0, 6
+ psrad m5, 6
+ packssdw m0, m5
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m10, m5 ; a3
+ pmaddwd m10, m6 ; b3
+ paddd m7, m8
+ paddd m9, m10
+ psrad m7, 6
+ psrad m9, 6
+ packssdw m7, m9
+ vpermq m7, m7, q3120
+ mova [tmpq+r8*0], xm7
+ vextracti128 [tmpq+r8*2], m7, 1
+ lea tmpq, [tmpq+r8*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r5, 16
+ add r7, 16
+ movzx hd, wb
+ mov srcq, r5
+ mov tmpq, r7
+ sub wd, 1<<8
+ jg .hv_w8_loop0
+%if WIN64
+ POP r8
+%endif
+ RET
+
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
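+; the scaled-MC code below is shared between put and prep; prep takes one
+; argument register fewer, so when isprep every rN alias is shifted down by
+; one (rN refers to the previous register) and restored again on return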
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
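+; MC_8TAP_SCALED_H gathers two source rows at the eight per-column offsets
+; (r4/r6/r7/r9 in the low lane, r10/r11/r13/rX in the high lane), applies the
+; per-column filters in m12-m15 and reduces both rows into one packed
+; register, rounding with m10 and shifting by xm11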
+%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd
+ movu xm%1, [srcq+ r4*2]
+ movu xm%2, [srcq+ r6*2]
+ movu xm%3, [srcq+ r7*2]
+ movu xm%4, [srcq+ r9*2]
+ vinserti128 m%1, [srcq+r10*2], 1
+ vinserti128 m%2, [srcq+r11*2], 1
+ vinserti128 m%3, [srcq+r13*2], 1
+ vinserti128 m%4, [srcq+ rX*2], 1
+ add srcq, ssq
+ movu xm%5, [srcq+ r4*2]
+ movu xm%6, [srcq+ r6*2]
+ movu xm%7, [srcq+ r7*2]
+ movu xm%8, [srcq+ r9*2]
+ vinserti128 m%5, [srcq+r10*2], 1
+ vinserti128 m%6, [srcq+r11*2], 1
+ vinserti128 m%7, [srcq+r13*2], 1
+ vinserti128 m%8, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m%1, m12
+ pmaddwd m%2, m13
+ pmaddwd m%3, m14
+ pmaddwd m%4, m15
+ pmaddwd m%5, m12
+ pmaddwd m%6, m13
+ pmaddwd m%7, m14
+ pmaddwd m%8, m15
+ phaddd m%1, m%2
+ %if %9
+ mova m10, [rsp+0x00]
+ %endif
+ phaddd m%3, m%4
+ phaddd m%5, m%6
+ phaddd m%7, m%8
+ phaddd m%1, m%3
+ phaddd m%5, m%7
+ paddd m%1, m10
+ paddd m%5, m10
+ psrad m%1, xm11
+ psrad m%5, xm11
+ packssdw m%1, m%5
+%endmacro
+
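+; MC_8TAP_SCALED generates both put_8tap_scaled_16bpc and prep_8tap_scaled_16bpc.
+; mx/my advance by dx/dy in 1/1024-pel steps; dy == 1024 and dy == 2048 take
+; the dedicated .dy1/.dy2 paths, anything else reloads the vertical filter
+; for every output row in the generic .vloop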
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isput 1
+ %assign isprep 0
+cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %xdefine base_reg r12
+ mov r7d, pxmaxm
+%else
+ %assign isput 0
+ %assign isprep 1
+cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %define tmp_stridem qword [rsp+0xd0]
+ %xdefine base_reg r11
+%endif
+ lea base_reg, [%1_8tap_scaled_16bpc_avx2]
+%define base base_reg-%1_8tap_scaled_16bpc_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm10, mxd
+ vpbroadcastd m10, xm10
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+ mov r6d, pxmaxm
+%else
+ vpbroadcastd m10, mxm
+ %if isput
+ vpbroadcastw m11, pxmaxm
+ %else
+ mov r6d, pxmaxm
+ %endif
+%endif
+ mov dyd, dym
+%if isput
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %define dsm [rsp+0x98]
+ %define rX r1
+ %define rXd r1d
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+0x98]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ shr r7d, 11
+ vpbroadcastd m6, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4]
+ movd xm7, [base+s_8tap_h_sh+r7*4]
+%if isput
+ vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4]
+ pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2
+%else
+ vpbroadcastd m13, [base+pd_m524256]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0,1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*2]
+ movu xm3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*0], 1 ; 0 4
+ vinserti128 m1, [srcq+ssq*1], 1 ; 1 5
+ vinserti128 m2, [srcq+ssq*2], 1 ; 2 6
+ vinserti128 m3, [srcq+ss3q ], 1 ; 3 7
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m10}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ phaddd m0, m1
+ phaddd m2, m3
+ paddd m0, m12
+ paddd m2, m12
+ psrad m0, xm7
+ psrad m2, xm7
+ packssdw m0, m2 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm14, r6q
+ pmovsxbw xm14, xm14
+ pshufd xm8, xm14, q0000
+ pshufd xm9, xm14, q1111
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pshufd xm8, xm14, q2222
+ pshufd xm14, xm14, q3333
+ paddd xm5, xm6
+ pmaddwd xm6, xm2, xm8
+ pmaddwd xm8, xm4, xm14
+ psrldq xm9, xm7, 8
+ paddd xm5, xm6
+ paddd xm5, xm13
+ paddd xm5, xm8
+ psrad xm5, xm9
+ packusdw xm5, xm5
+ pminsw xm5, xm11
+ movd [dstq], xm5
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movu xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm10
+ pmaddwd xm5, xm15
+ phaddd xm5, xm5
+ paddd xm5, xm12
+ psrad xm5, xm7
+ packssdw xm5, xm5
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movu xm6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm10
+ pshufb xm6, xm10
+ pmaddwd xm5, xm15
+ pmaddwd xm6, xm15
+ phaddd xm5, xm6
+ paddd xm5, xm12
+ psrad xm5, xm7
+ packssdw xm5, xm5 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
+.w4:
+ mov myd, mym
+ mova [rsp+0x00], m12
+%if isput
+ mova [rsp+0x20], xm13
+%else
+ SWAP m11, m13
+%endif
+ mova [rsp+0x30], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m0, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ pcmpeqd m6, m9
+ punpckldq m10, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pblendvb m14, m2, m10
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ movu xm1, [srcq+r4 ]
+ movu xm3, [srcq+r6 ]
+ movu xm2, [srcq+r11 ]
+ movu xm4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ vinserti128 m1, [srcq+r4 ], 1
+ vinserti128 m3, [srcq+r6 ], 1
+ vinserti128 m2, [srcq+r11 ], 1
+ vinserti128 m4, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m0
+ paddb m13, m0
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m14}, m7, m9, m8, m10
+ REPX {pshufb x, m13}, m1, m2, m3, m4
+ REPX {pmaddwd x, m15}, m1, m2, m3, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x30]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1 4 5
+ packssdw m8, m10 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm10, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm10 ; 67
+ mova [rsp+0x40], xm7
+ mova [rsp+0x50], xm8
+ mova [rsp+0x60], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r11d, 64 << 24
+ mov r13d, myd
+ shr r13d, 6
+ lea r13d, [t1+r13]
+ cmovnz r11q, [base+subpel_filters+r13*8]
+ movq xm9, r11q
+ pmovsxbw xm9, xm9
+ pshufd xm7, xm9, q0000
+ pshufd xm8, xm9, q1111
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pshufd xm7, xm9, q2222
+ pshufd xm9, xm9, q3333
+ pmaddwd xm6, xm2, xm7
+ pmaddwd xm8, xm3, xm9
+%if isput
+ mova xm7, [rsp+0x20]
+ movd xm9, [rsp+0x38]
+%else
+ SWAP m7, m11
+%endif
+ paddd xm4, xm5
+ paddd xm6, xm8
+ paddd xm4, xm6
+ paddd xm4, xm7
+%if isput
+ psrad xm4, xm9
+ packusdw xm4, xm4
+ pminuw xm4, xm11
+ movq [dstq], xm4
+ add dstq, dsq
+%else
+ SWAP m11, m7
+ psrad xm4, 6
+ packssdw xm4, xm4
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ mova xm8, [rsp+0x00]
+ movd xm9, [rsp+0x30]
+ movu xm4, [srcq]
+ movu xm5, [srcq+r4]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x40]
+ mova [rsp+0x40], xm1
+ mova xm1, [rsp+0x50]
+ mova [rsp+0x50], xm2
+ mova xm2, [rsp+0x60]
+ mova [rsp+0x60], xm3
+ pshufb xm4, xm12
+ pshufb xm5, xm13
+ pmaddwd xm4, xm14
+ pmaddwd xm5, xm15
+ phaddd xm4, xm5
+ paddd xm4, xm8
+ psrad xm4, xm9
+ packssdw xm4, xm4
+ punpcklwd xm3, xm10, xm4
+ mova xm10, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm6, [srcq+ssq*1]
+ movu xm7, [srcq+r6]
+ movu m0, [rsp+0x50]
+ pshufb xm4, xm12
+ pshufb xm6, xm12
+ pshufb xm5, xm13
+ pshufb xm7, xm13
+ pmaddwd xm4, xm14
+ pmaddwd xm6, xm14
+ pmaddwd xm5, xm15
+ pmaddwd xm7, xm15
+ mova [rsp+0x40], m0
+ phaddd xm4, xm5
+ phaddd xm6, xm7
+ paddd xm4, xm8
+ paddd xm6, xm8
+ psrad xm4, xm9
+ psrad xm6, xm9
+ packssdw xm4, xm6
+ punpcklwd xm9, xm10, xm4
+ mova [rsp+0x60], xm9
+ psrldq xm10, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm10
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+ SWAP m10, m13
+%if isprep
+ SWAP m13, m11
+%endif
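+; wider blocks are processed as 8-pixel-wide columns; [rsp+0x80] holds the
+; number of columns and tmp_stridem the row stride of the prep output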
+.w8:
+ mov dword [rsp+0x80], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+0x80], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+0x80], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+0x80], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+0x80], 16
+ movifprep tmp_stridem, 256
+.w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+%if isput
+ movifnidn dsm, dsq
+ mova [rsp+0xb0], xm7
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ shr t0d, 16
+ sub srcq, 6
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0x84], t0d
+ mov [rsp+0x88], srcq
+ mov [rsp+0x90], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+0x80]
+ jz .ret
+ add qword [rsp+0x90], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x40]
+ vpbroadcastd m15, [rsp+0x84]
+ pxor m9, m9
+ mov srcq, [rsp+0x88]
+ mov r0q, [rsp+0x90] ; dstq / tmpq
+.hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x40], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq [rsp+0xa0], xm1
+ movq [rsp+0xa8], xm7
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x60], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x60]
+ vbroadcasti128 m9, [base+subpel_s_shuf8]
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m9 ; 01a 01b
+ pshufb m1, m9 ; 23a 23b
+ pshufb m2, m9 ; 45a 45b
+ pshufb m3, m9 ; 67a 67b
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm9, r6q
+ punpcklqdq xm9, xm9
+ pmovsxbw m9, xm9
+ pshufd m8, m9, q0000
+ pshufd m7, m9, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m7
+ pshufd m8, m9, q2222
+ pshufd m9, m9, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m9
+%if isput
+ psrldq xm8, xm11, 8
+%endif
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, xm8
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xb0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+0x60], myd
+ mov r4d, [rsp+0xa0]
+ mov r6d, [rsp+0xa4]
+ mov r7d, [rsp+0xa8]
+ mov r9d, [rsp+0xac]
+ jz .skip_line
+ vbroadcasti128 m9, [base+wswap]
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ mov myd, [rsp+0x60]
+ mov dyd, dym
+ pshufb m0, m9
+ pshufb m1, m9
+ pshufb m2, m9
+ pshufb m3, m9
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m4, m6
+ paddd m4, m10
+ psrad m4, xm11
+ pslld m4, 16
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1
+ vbroadcasti128 m9, [base+subpel_s_shuf8]
+ mov myd, [rsp+0x60]
+ mov dyd, dym
+ pshufb m3, m9
+ jmp .vloop
+ SWAP m1, m12, m10
+ SWAP m7, m11
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0-1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*2]
+ movu xm3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*1], 1
+ vinserti128 m2, [srcq+ssq*2], 1
+ add srcq, ss3q
+ movq xm6, r4q
+ pmovsxbw xm6, xm6
+ pshufd xm8, xm6, q0000
+ pshufd xm9, xm6, q1111
+ pshufd xm14, xm6, q2222
+ pshufd xm6, xm6, q3333
+ REPX {pshufb x, m10}, m0, m1, m2
+ pshufb xm3, xm10
+ REPX {pmaddwd x, m15}, m0, m1, m2
+ pmaddwd xm3, xm15
+ phaddd m0, m1
+ phaddd m2, m3
+ paddd m0, m12
+ paddd m2, m12
+ psrad m0, xm7
+ psrad m2, xm7
+ packssdw m0, m2
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4
+ pshufd xm4, xm1, q2121
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ punpcklwd xm2, xm1, xm4 ; 45 56
+.dy1_w2_loop:
+ movu xm1, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm1, xm10
+ pshufb xm5, xm10
+ pmaddwd xm1, xm15
+ pmaddwd xm5, xm15
+ phaddd xm1, xm5
+ pmaddwd xm5, xm3, xm8
+ mova xm3, xm0
+ pmaddwd xm0, xm9
+ paddd xm1, xm12
+ psrad xm1, xm7
+ packssdw xm1, xm1
+ paddd xm5, xm0
+ mova xm0, xm2
+ pmaddwd xm2, xm14
+ paddd xm5, xm2
+ palignr xm2, xm1, xm4, 12
+ punpcklwd xm2, xm1 ; 67 78
+ pmaddwd xm4, xm2, xm6
+ paddd xm5, xm13
+ paddd xm5, xm4
+ mova xm4, xm1
+ psrldq xm1, xm7, 8
+ psrad xm5, xm1
+ packusdw xm5, xm5
+ pminsw xm5, xm11
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+.dy1_w4:
+ mov myd, mym
+%if isput
+ mova [rsp+0x50], xm11
+%endif
+ mova [rsp+0x00], m12
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m4, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ pcmpeqd m6, m9
+ punpckldq m10, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pblendvb m14, m2, m10
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*2]
+ lea r11, [r4+ssq*1]
+ lea r13, [r4+ss3q ]
+ movu xm0, [srcq+ssq*0]
+ movu xm7, [srcq+r4 ]
+ movu xm1, [srcq+ssq*2]
+ movu xm8, [srcq+r6 ]
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
+ vinserti128 m7, [srcq+r11 ], 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
+ vinserti128 m8, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ movu xm2, [srcq+ssq*0]
+ movu xm9, [srcq+r4 ]
+ movu xm3, [srcq+ssq*2] ; 6 _
+ movu xm10, [srcq+r6 ]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ vinserti128 m9, [srcq+r11 ], 1
+ lea srcq, [srcq+ss3q ]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m4
+ paddb m13, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x40]
+ pshufb m0, m12
+ pshufb m1, m12
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pshufb m7, m13
+ pshufb m8, m13
+ pmaddwd m7, m15
+ pmaddwd m8, m15
+ pshufb m2, m12
+ pshufb xm3, xm12
+ pmaddwd m2, m14
+ pmaddwd xm3, xm14
+ pshufb m9, m13
+ pshufb xm10, xm13
+ pmaddwd m9, m15
+ pmaddwd xm10, xm15
+ phaddd m0, m7
+ phaddd m1, m8
+ phaddd m2, m9
+ phaddd xm3, xm10
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ paddd xm3, xm5
+ psrad m0, xm6
+ psrad m1, xm6
+ psrad m2, xm6
+ psrad xm3, xm6
+ vperm2i128 m4, m0, m1, 0x21 ; 1 2
+ vperm2i128 m5, m1, m2, 0x21 ; 3 4
+ vperm2i128 m6, m2, m3, 0x21 ; 5 6
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ pslld m4, 16
+ pslld m5, 16
+ pslld m6, 16
+ pblendw m0, m4, 0xaa ; 01 12
+ pblendw m1, m5, 0xaa ; 23 34
+ pblendw m2, m6, 0xaa ; 45 56
+ movq xm10, r13q
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+.dy1_w4_loop:
+ movu xm11, [srcq+ssq*0]
+ movu xm6, [srcq+r4 ]
+ vinserti128 m11, [srcq+ssq*1], 1
+ vinserti128 m6, [srcq+r11 ], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pshufb m11, m12
+ pshufb m6, m13
+ pmaddwd m11, m14
+ pmaddwd m6, m15
+ paddd m4, [rsp+0x20]
+ phaddd m11, m6
+ pmaddwd m6, m2, m9
+ paddd m11, [rsp+0x00]
+ psrad m11, [rsp+0x40]
+ mova m0, m1
+ mova m1, m2
+ paddd m5, m6
+ paddd m4, m5
+ vinserti128 m2, m3, xm11, 1
+ pslld m3, m11, 16
+ pblendw m2, m3, 0xaa ; 67 78
+ pmaddwd m5, m2, m10
+ vextracti128 xm3, m11, 1
+ paddd m4, m5
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0x50]
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET
+ SWAP m10, m13
+.dy1_w8:
+ mov dword [rsp+0xa0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+0xa0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+0xa0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+0xa0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+0xa0], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+ mov myd, mym
+%if isput
+ %define dsm [rsp+0xb8]
+ movifnidn dsm, dsq
+ mova [rsp+0xc0], xm7
+%else
+ %if UNIX64
+ %define hm [rsp+0xb8]
+ %endif
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm11
+ shr t0d, 16
+ sub srcq, 6
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0xa4], t0d
+ mov [rsp+0xa8], srcq
+ mov [rsp+0xb0], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+0xa0]
+ jz .ret
+ add qword [rsp+0xb0], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x60]
+ vpbroadcastd m15, [rsp+0xa4]
+ pxor m9, m9
+ mov srcq, [rsp+0xa8]
+ mov r0q, [rsp+0xb0] ; dstq / tmpq
+ mova m10, [rsp+0x00]
+ mova xm11, [rsp+0x40]
+.dy1_hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x60], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x80], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x80]
+ vbroadcasti128 m7, [base+subpel_s_shuf8]
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m10, [rsp+0x58]
+ vpbroadcastd m11, [rsp+0x5c]
+ pshufb m0, m7 ; 01a 01b
+ pshufb m1, m7 ; 23a 23b
+ pshufb m2, m7 ; 45a 45b
+ pshufb m3, m7 ; 67a 67b
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xc0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ vbroadcasti128 m7, [base+wswap]
+ pshufb m0, m7
+ pshufb m1, m7
+ pshufb m2, m7
+ pshufb m3, m7
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m4, m6
+ paddd m4, [rsp+0x00]
+ psrad m4, [rsp+0x40]
+ pslld m4, 16
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
+ SWAP m1, m12, m10
+ SWAP m7, m11
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0-1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*2]
+ movu xm2, [srcq+ssq*4]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m0, m10
+ pshufb m1, m10
+ pshufb m2, m10
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+ pmaddwd m2, m15
+ movq xm6, r4q
+ pmovsxbw xm6, xm6
+ phaddd m0, m1
+ phaddd m1, m2
+ paddd m0, m12
+ paddd m1, m12
+ psrad m0, xm7
+ psrad m1, xm7
+ packssdw m0, m1 ; 0 2 2 4 1 3 3 5
+ vextracti128 xm1, m0, 1
+ pshufd xm8, xm6, q0000
+ pshufd xm9, xm6, q1111
+ pshufd xm14, xm6, q2222
+ pshufd xm6, xm6, q3333
+ punpcklwd xm2, xm0, xm1 ; 01 23
+ punpckhwd xm1, xm0, xm1 ; 23 45
+.dy2_w2_loop:
+ movu xm3, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*2]
+ vinserti128 m3, [srcq+ssq*1], 1 ; 6 7
+ vinserti128 m5, [srcq+ss3q ], 1 ; 8 9
+ lea srcq, [srcq+ssq*4]
+ pmaddwd xm4, xm2, xm8
+ pmaddwd xm1, xm9
+ pshufb m3, m10
+ pshufb m5, m10
+ pmaddwd m3, m15
+ pmaddwd m5, m15
+ phaddd m3, m5
+ paddd xm4, xm1
+ paddd m3, m12
+ psrad m3, xm7
+ packssdw m3, m3
+ pshufd m3, m3, q2100
+ palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9
+ vextracti128 xm1, m0, 1
+ punpcklwd xm2, xm0, xm1 ; 45 67
+ punpckhwd xm1, xm0, xm1 ; 67 89
+ pmaddwd xm3, xm2, xm14
+ pmaddwd xm5, xm1, xm6
+ paddd xm4, xm13
+ paddd xm4, xm3
+ psrldq xm3, xm7, 8
+ paddd xm4, xm5
+ psrad xm4, xm3
+ packusdw xm4, xm4
+ pminsw xm4, xm11
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+.dy2_w4:
+ mov myd, mym
+%if isput
+ mova [rsp+0x50], xm11
+%endif
+ mova [rsp+0x00], m12
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m4, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ pcmpeqd m6, m9
+ punpckldq m11, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ movq xm10, r13q
+ pblendvb m14, m2, m11
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu xm0, [srcq+ssq*0]
+ movu xm7, [srcq+r4 ]
+ movu xm1, [srcq+ssq*1]
+ movu xm8, [srcq+r6 ]
+ vinserti128 m0, [srcq+ssq*2], 1 ; 0 2
+ vinserti128 m7, [srcq+r11 ], 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 1 3
+ vinserti128 m8, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ movu xm2, [srcq+ssq*0]
+ movu xm9, [srcq+r4 ]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ vinserti128 m9, [srcq+r6 ], 1
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m4
+ paddb m13, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x40]
+ pshufb m0, m12
+ pshufb m1, m12
+ pshufb m2, m12
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pshufb m7, m13
+ pshufb m8, m13
+ pshufb m9, m13
+ pmaddwd m7, m15
+ pmaddwd m8, m15
+ pmaddwd m9, m15
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ phaddd m0, m7
+ phaddd m1, m8
+ phaddd m2, m9
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ psrad m0, xm6
+ psrad m1, xm6
+ psrad m2, xm6
+ vperm2i128 m3, m0, m2, 0x21 ; 2 4
+ vperm2i128 m2, m1, 0x13 ; 3 5
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ packssdw m0, m3 ; 0 2 2 4
+ packssdw m1, m2 ; 1 3 3 5
+ punpckhwd m2, m0, m1 ; 23 45
+ punpcklwd m0, m1 ; 01 23
+.dy2_w4_loop:
+ movu xm1, [srcq+ssq*0]
+ movu xm6, [srcq+r4 ]
+ movu xm3, [srcq+ssq*1]
+ movu xm11, [srcq+r6 ]
+ vinserti128 m1, [srcq+ssq*2], 1 ; 6 8
+ vinserti128 m6, [srcq+r11 ], 1
+ vinserti128 m3, [srcq+ss3q ], 1 ; 7 9
+ vinserti128 m11, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m2, m8
+ pshufb m1, m12
+ pshufb m3, m12
+ pmaddwd m1, m14
+ pmaddwd m3, m14
+ mova m0, [rsp+0x00]
+ pshufb m6, m13
+ pshufb m11, m13
+ pmaddwd m6, m15
+ pmaddwd m11, m15
+ paddd m4, m5
+ movd xm5, [rsp+0x40]
+ phaddd m1, m6
+ phaddd m3, m11
+ paddd m1, m0
+ paddd m3, m0
+ psrad m1, xm5
+ psrad m3, xm5
+ pslld m3, 16
+ pblendw m1, m3, 0xaa ; 67 89
+ vperm2i128 m0, m2, m1, 0x21 ; 45 67
+ paddd m4, [rsp+0x20]
+ mova m2, m1
+ pmaddwd m5, m0, m9
+ pmaddwd m6, m2, m10
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0x50]
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+ SWAP m10, m13
+.dy2_w8:
+ mov dword [rsp+0xa0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+0xa0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+0xa0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+0xa0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+0xa0], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+ mov myd, mym
+%if isput
+ movifnidn dsm, dsq
+ mova [rsp+0xc0], xm7
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm11
+ shr t0d, 16
+ sub srcq, 6
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0xa4], t0d
+ mov [rsp+0xa8], srcq
+ mov [rsp+0xb0], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+0xa0]
+ jz .ret
+ add qword [rsp+0xb0], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x60]
+ vpbroadcastd m15, [rsp+0xa4]
+ pxor m9, m9
+ mov srcq, [rsp+0xa8]
+ mov r0q, [rsp+0xb0] ; dstq / tmpq
+ mova m10, [rsp+0x00]
+ mova xm11, [rsp+0x40]
+.dy2_hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x60], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x80], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x80]
+ vbroadcasti128 m7, [base+subpel_s_shuf8]
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m10, [rsp+0x58]
+ vpbroadcastd m11, [rsp+0x5c]
+ pshufb m0, m7 ; 01a 01b
+ pshufb m1, m7 ; 23a 23b
+ pshufb m2, m7 ; 45a 45b
+ pshufb m3, m7 ; 67a 67b
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xc0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movu xm3, [srcq+ r4*2]
+ movu xm4, [srcq+ r6*2]
+ movu xm5, [srcq+ r7*2]
+ movu xm6, [srcq+ r9*2]
+ vinserti128 m3, [srcq+r10*2], 1
+ vinserti128 m4, [srcq+r11*2], 1
+ vinserti128 m5, [srcq+r13*2], 1
+ vinserti128 m6, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m3, m12
+ pmaddwd m4, m13
+ pmaddwd m5, m14
+ pmaddwd m6, m15
+ phaddd m3, m4
+ phaddd m5, m6
+ phaddd m3, m5
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ mova m5, [rsp+0x00]
+ movd xm7, [rsp+0x40]
+ phaddd m4, m6
+ paddd m3, m5
+ paddd m4, m5
+ psrad m3, xm7
+ psrad m4, xm7
+ pslld m4, 16
+ pblendw m3, m4, 0xaa
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isput
+%undef isprep
+%endmacro
+
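+; Scaled bilinear reuses the 8-tap scaled code: t0/t1 are loaded with a
+; fixed filter-type selector before tail-calling the 8tap_scaled entry point.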
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_16bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, t0d
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%macro WARP_V 5 ; dst, 01, 23, 45, 67
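+    ; Vertical pass: gathers eight per-column 8-tap filters (pairs a/e, b/f,
+    ; c/g, d/h below), multiply-accumulates them against the 01/23/45/67 row
+    ; pairs and rotates those registers ready for the next output row.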
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+myq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+myq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ punpcklwd m8, m0
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+ punpcklwd m0, m9, m0
+ punpckldq m9, m8, m0
+ punpckhdq m0, m8, m0
+ punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
+ punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
+ pmaddwd m%2, m8
+ pmaddwd m9, m%3
+ punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
+ punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
+ pmaddwd m8, m%4
+ pmaddwd m0, m%5
+ paddd m9, m%2
+ mova m%2, m%3
+ paddd m0, m8
+ mova m%3, m%4
+ mova m%4, m%5
+ paddd m%1, m0, m9
+%endmacro
+
+cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts
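+    ; "t" variant: reuses warp_affine_8x8_16bpc's .main but stores the
+    ; packed 16-bit intermediates to tmp instead of final pixels.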
+ mov r6d, r7m
+ lea r9, [$$]
+ shr r6d, 11
+ vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4]
+ vpbroadcastd m14, [warp8x8t_rnd]
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main
+ jmp .start
+.loop:
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2
+ lea tmpq, [tmpq+tsq*4]
+.start:
+ paddd m7, m14
+ paddd m0, m14
+ psrad m7, 15
+ psrad m0, 15
+ packssdw m7, m0
+ vpermq m7, m7, q3120
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
+ dec r4d
+ jg .loop
+.end:
+ RET
+
+cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \
+ alpha, beta, filter, tmp1, delta, \
+ my, gamma
+ mov r6d, r7m
+ lea filterq, [$$]
+ shr r6d, 11
+ vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4]
+ vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4]
+ vpbroadcastw m15, r7m ; pixel_max
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m7, 16
+ psrad m0, 16
+ packusdw m7, m0
+ pmulhrsw m7, m14
+ pminsw m7, m15
+ vpermq m7, m7, q3120
+ mova [dstq+dsq*0], xm7
+ vextracti128 [dstq+dsq*1], m7, 1
+ dec r4d
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov abcdq, r5m
+ mov mxd, r6m
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ vpbroadcastd m12, [pd_32768]
+ pxor m11, m11
+ add filterq, mc_warp_filter-$$
+ lea tmp1q, [ssq*3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3
+ sub betad, tmp2d ; beta -= alpha*3
+ mov myd, r7m
+ call .h
+ psrld m1, m0, 16
+ call .h
+ pblendw m1, m0, 0xaa ; 01
+ psrld m2, m0, 16
+ call .h
+ pblendw m2, m0, 0xaa ; 12
+ psrld m3, m0, 16
+ call .h
+ pblendw m3, m0, 0xaa ; 23
+ psrld m4, m0, 16
+ call .h
+ pblendw m4, m0, 0xaa ; 34
+ psrld m5, m0, 16
+ call .h
+ pblendw m5, m0, 0xaa ; 45
+ psrld m6, m0, 16
+ call .h
+ pblendw m6, m0, 0xaa ; 56
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ add myd, 512+(64<<10)
+ mov r4d, 4
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+.main2:
+ call .h
+ psrld m7, m6, 16
+ pblendw m7, m0, 0xaa ; 67
+ WARP_V 7, 1, 3, 5, 7
+ call .h
+ psrld m10, m5, 16
+ pblendw m10, m0, 0xaa ; 78
+ WARP_V 0, 2, 4, 6, 10
+ ret
+ALIGN function_align
+.h:
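+    ; Horizontal pass for one row of 8 pixels: the filter index is mx >> 10,
+    ; advancing by alpha per pixel; "mx += beta" below moves on to the next row.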
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ movu xm10, [srcq-6]
+ vinserti128 m10, [srcq+2], 1
+ shr mxd, 10 ; 0
+ shr tmp1d, 10 ; 4
+ movq xm0, [filterq+mxq *8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ movu xm8, [srcq-4]
+ vinserti128 m8, [srcq+4], 1
+ shr tmp2d, 10 ; 1
+ shr tmp1d, 10 ; 5
+ movq xm9, [filterq+tmp2q*8]
+ vinserti128 m9, [filterq+tmp1q*8], 1
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10 ; 2
+ shr tmp1d, 10 ; 6
+ punpcklbw m0, m11, m0
+ pmaddwd m0, m10
+ movu xm10, [srcq-2]
+ vinserti128 m10, [srcq+6], 1
+ punpcklbw m9, m11, m9
+ pmaddwd m9, m8
+ movq xm8, [filterq+mxq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ phaddd m0, m9 ; 0 1 4 5
+ movu xm9, [srcq+0]
+ vinserti128 m9, [srcq+8], 1
+ shr tmp2d, 10 ; 3
+ shr tmp1d, 10 ; 7
+ punpcklbw m8, m11, m8
+ pmaddwd m8, m10
+ movq xm10, [filterq+tmp2q*8]
+ vinserti128 m10, [filterq+tmp1q*8], 1
+ punpcklbw m10, m11, m10
+ pmaddwd m9, m10
+ add srcq, ssq
+ phaddd m8, m9 ; 2 3 6 7
+ phaddd m0, m8 ; 0 1 2 3 4 5 6 7
+ vpsllvd m0, m13
+ paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword
+ ret
+
+%macro BIDIR_FN 0
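+    ; Shared store loops for the bidir functions: each .main call produces
+    ; four 32-byte blocks in m0-m3 which are written out per block width.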
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ cmp hd, 8
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.ret:
+ RET
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ cmp hd, 4
+ jne .w8_loop_start
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+.w8_loop_start:
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
+cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx2_table
+ lea r6, [avg_avx2_table]
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ vpbroadcastd m4, [base+bidir_rnd+t0*4]
+ vpbroadcastd m5, [base+bidir_mul+t0*4]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
+.main:
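+    ; Saturating add of the two intermediates, then pmaxsw/psubsw against the
+    ; (negative) bidir_rnd constant to drop the prep offset and clamp at zero,
+    ; and pmulhw by bidir_mul for the final shift back to pixel range.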
+ mova m0, [tmp1q+32*0]
+ paddsw m0, [tmp2q+32*0]
+ mova m1, [tmp1q+32*1]
+ paddsw m1, [tmp2q+32*1]
+ mova m2, [tmp1q+32*2]
+ paddsw m2, [tmp2q+32*2]
+ mova m3, [tmp1q+32*3]
+ paddsw m3, [tmp2q+32*3]
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ pmaxsw m3, m4
+ psubsw m0, m4
+ psubsw m1, m4
+ psubsw m2, m4
+ psubsw m3, m4
+ pmulhw m0, m5
+ pmulhw m1, m5
+ pmulhw m2, m5
+ pmulhw m3, m5
+ ret
+
+cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3
+ lea r6, [w_avg_avx2_table]
+ tzcnt wd, wm
+ mov t0d, r6m ; weight
+ vpbroadcastw m8, r7m ; pixel_max
+ vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538]
+ movsxd wq, [r6+wq*4]
+ paddw m7, m8
+ add wq, r6
+ lea r6d, [t0-16]
+ shl t0d, 16
+ sub t0d, r6d ; 16-weight, weight
+ pslld m7, 7
+ rorx r6d, t0d, 30 ; << 2
+ test dword r7m, 0x800
+ cmovz r6d, t0d
+ movifnidn hd, hm
+ movd xm6, r6d
+ vpbroadcastd m6, xm6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m4, [tmp1q+32*0]
+ mova m0, [tmp2q+32*0]
+ punpckhwd m5, m0, m4
+ punpcklwd m0, m4
+ mova m4, [tmp1q+32*1]
+ mova m1, [tmp2q+32*1]
+ pmaddwd m5, m6
+ pmaddwd m0, m6
+ paddd m5, m7
+ paddd m0, m7
+ psrad m5, 8
+ psrad m0, 8
+ packusdw m0, m5
+ punpckhwd m5, m1, m4
+ punpcklwd m1, m4
+ mova m4, [tmp1q+32*2]
+ mova m2, [tmp2q+32*2]
+ pmaddwd m5, m6
+ pmaddwd m1, m6
+ paddd m5, m7
+ paddd m1, m7
+ psrad m5, 8
+ psrad m1, 8
+ packusdw m1, m5
+ punpckhwd m5, m2, m4
+ punpcklwd m2, m4
+ mova m4, [tmp1q+32*3]
+ mova m3, [tmp2q+32*3]
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ pmaddwd m5, m6
+ pmaddwd m2, m6
+ paddd m5, m7
+ paddd m2, m7
+ psrad m5, 8
+ psrad m2, 8
+ packusdw m2, m5
+ punpckhwd m5, m3, m4
+ punpcklwd m3, m4
+ pmaddwd m5, m6
+ pmaddwd m3, m6
+ paddd m5, m7
+ paddd m3, m7
+ psrad m5, 8
+ psrad m3, 8
+ packusdw m3, m5
+ pminsw m0, m8
+ pminsw m1, m8
+ pminsw m2, m8
+ pminsw m3, m8
+ ret
+
+cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx2_table
+ lea r7, [mask_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_64]
+ vpbroadcastd m9, [base+bidir_rnd+r6*4]
+ vpbroadcastd m10, [base+bidir_mul+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ BIDIR_FN
+ALIGN function_align
+.main:
+%macro MASK 1
+ pmovzxbw m5, [maskq+16*%1]
+ mova m%1, [tmp1q+32*%1]
+ mova m6, [tmp2q+32*%1]
+ punpckhwd m4, m%1, m6
+ punpcklwd m%1, m6
+ psubw m7, m8, m5
+ punpckhwd m6, m5, m7 ; m, 64-m
+ punpcklwd m5, m7
+ pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m%1, m5
+ psrad m4, 5
+ psrad m%1, 5
+ packssdw m%1, m4
+ pmaxsw m%1, m9
+ psubsw m%1, m9
+ pmulhw m%1, m10
+%endmacro
+ MASK 0
+ MASK 1
+ MASK 2
+ MASK 3
+ add maskq, 16*4
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ ret
+
+cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd xm0, r7m ; sign
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+bidir_rnd+r6*4]
+ vpbroadcastd m13, [base+bidir_mul+r6*4]
+ movd xm14, [base+pw_2]
+ mov maskq, maskmp
+ psubw xm14, xm0
+ vpbroadcastw m14, xm14
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ phaddd m4, m5
+ paddw m4, m14
+ psrlw m4, 2
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+ punpcklwd xm4, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ mova [maskq], xm4
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8:
+ vperm2i128 m6, m4, m5, 0x21
+ vpblendd m4, m5, 0xf0
+ paddw m4, m14
+ paddw m4, m6
+ psrlw m4, 2
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ mova [maskq], xm4
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16:
+ punpcklqdq m6, m4, m5
+ punpckhqdq m4, m5
+ paddw m6, m14
+ paddw m4, m6
+ psrlw m4, 2
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ pshufd xm4, xm4, q3120
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ mova [maskq], xm4
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w32:
+ paddw m4, m14
+ paddw m4, m5
+ psrlw m15, m4, 2
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ call .main
+ mova m6, [deint_shuf]
+ paddw m4, m14
+ paddw m4, m5
+ psrlw m4, 2
+ packuswb m15, m4
+ vpermd m4, m6, m15
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m2
+ mova [dstq+stride3q +32*1], m3
+ mova [maskq], m4
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w64:
+ paddw m4, m14
+ paddw m15, m14, m5
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [maskq], m4 ; no available registers
+ call .main
+ paddw m4, [maskq]
+ mova m6, [deint_shuf]
+ paddw m5, m15
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5 ; 0 2 4 6 1 3 5 7
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ mova [maskq], m4
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 64
+.w128:
+ paddw m4, m14
+ paddw m5, m14
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [maskq+32*0], m4
+ mova [dstq+strideq], m5
+ call .main
+ paddw m4, m14
+ paddw m15, m14, m5
+ mova [dstq+strideq*0+32*4], m0
+ mova [dstq+strideq*0+32*5], m1
+ mova [dstq+strideq*0+32*6], m2
+ mova [dstq+strideq*0+32*7], m3
+ mova [maskq+32*1], m4
+ call .main
+ paddw m4, [maskq+32*0]
+ paddw m5, [dstq+strideq]
+ mova m6, [deint_shuf]
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ mova [maskq+32*0], m4
+ call .main
+ paddw m4, [maskq+32*1]
+ mova m6, [deint_shuf]
+ paddw m5, m15
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*4], m0
+ mova [dstq+strideq*1+32*5], m1
+ mova [dstq+strideq*1+32*6], m2
+ mova [dstq+strideq*1+32*7], m3
+ mova [maskq+32*1], m4
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul
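+    ; 64-m is derived from the difference of the two intermediates:
+    ; satsub(pw_27615, |tmp1-tmp2|) >> 10; the blend is then
+    ; (tmp1 * m + tmp2 * (64-m)) >> 5 followed by the usual bidir rnd/mul.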
+ mova m%1, [tmp1q+32*%1]
+ mova m%2, [tmp2q+32*%1]
+ punpcklwd m8, m%2, m%1
+ punpckhwd m9, m%2, m%1
+ psubsw m%1, m%2
+ pabsw m%1, m%1
+ psubusw m7, m10, m%1
+ psrlw m7, 10 ; 64-m
+ psubw m%2, m%3, m7 ; m
+ punpcklwd m%1, m7, m%2
+ punpckhwd m7, m%2
+ pmaddwd m%1, m8
+ pmaddwd m7, m9
+ psrad m%1, 5
+ psrad m7, 5
+ packssdw m%1, m7
+ pmaxsw m%1, m%4
+ psubsw m%1, m%4
+ pmulhw m%1, m%5
+%endmacro
+ W_MASK 0, 4
+ W_MASK 1, 5
+ phaddw m4, m5
+ W_MASK 2, 5
+ W_MASK 3, 6
+ phaddw m5, m6
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ ret
+
+cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+ lea r7, [w_mask_422_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ vpbroadcastb m14, r7m ; sign
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615]
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+bidir_rnd+r6*4]
+ vpbroadcastd m13, [base+bidir_mul+r6*4]
+ mova m15, [base+deint_shuf]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 4
+ W_MASK 1, 5
+ phaddw m4, m5
+ W_MASK 2, 5
+ W_MASK 3, 6
+ phaddw m5, m6
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ packuswb m4, m5
+ pxor m5, m5
+ psubb m4, m14
+ pavgb m4, m5
+ vpermd m4, m15, m4
+ mova [maskq], m4
+ add maskq, 32
+ ret
+
+cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx2_table
+ lea r7, [w_mask_444_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615]
+ vpbroadcastd m4, [base+pw_64]
+ vpbroadcastd m5, [base+bidir_rnd+r6*4]
+ vpbroadcastd m6, [base+bidir_mul+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end
+ call .main
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ call .main
+ mova [dstq+32*6], m0
+ mova [dstq+32*7], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2, 4, 5, 6
+ W_MASK 1, 3, 4, 5, 6
+ packuswb m2, m3
+ vpermq m2, m2, q3120
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ mova [maskq], m2
+ add maskq, 32
+ ret
+
+; (a * (64 - m) + b * m + 32) >> 6
+; = (((b - a) * m + 32) >> 6) + a
+; = (((b - a) * (m << 9) + 16384) >> 15) + a
+; except m << 9 overflows int16_t when m == 64 (which is possible),
+; but if we negate m it works out (-64 << 9 == -32768).
+; = (((a - b) * (m * -512) + 16384) >> 15) + a
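+; e.g. a=0, b=101, m=16: (0*48 + 101*16 + 32) >> 6 = 1648 >> 6 = 25,
+; and (((0 - 101) * (16 * -512) + 16384) >> 15) = 843776 >> 15 = 25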
+cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx2_table
+ lea r6, [blend_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ vpbroadcastd m6, [base+pw_m512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ pmovzxbw m3, [maskq]
+ movq xm0, [dstq+dsq*0]
+ movhps xm0, [dstq+dsq*1]
+ vpbroadcastq m1, [dstq+dsq*2]
+ vpbroadcastq m2, [dstq+r6 ]
+ vpblendd m0, m1, 0x30
+ vpblendd m0, m2, 0xc0
+ psubw m1, m0, [tmpq]
+ add maskq, 16
+ add tmpq, 32
+ pmullw m3, m6
+ pmulhrsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ movq [dstq+dsq*2], xm1
+ movhps [dstq+r6 ], xm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ mova xm1, [dstq+dsq*2]
+ vinserti128 m1, [dstq+r6 ], 1
+ psubw m2, m0, [tmpq+32*0]
+ psubw m3, m1, [tmpq+32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ mova [dstq+dsq*2], xm1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32:
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova m0, [dstq+32*0]
+ psubw m2, m0, [tmpq+32*0]
+ mova m1, [dstq+32*1]
+ psubw m3, m1, [tmpq+32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .w32
+ RET
+
+INIT_XMM avx2
+cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
+%define base r5-blend_v_avx2_table
+ lea r5, [blend_v_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ vpbroadcastd m2, [base+obmc_masks_avx2+2*2]
+.w2_loop:
+ movd m0, [dstq+dsq*0]
+ pinsrd m0, [dstq+dsq*1], 1
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ psubw m1, m0, m1
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ pextrd [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ vpbroadcastq m2, [base+obmc_masks_avx2+4*2]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ psubw m1, m0, [tmpq]
+ add tmpq, 8*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+INIT_YMM avx2
+.w8:
+ vbroadcasti128 m2, [base+obmc_masks_avx2+8*2]
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ psubw m1, m0, [tmpq]
+ add tmpq, 16*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ mova m4, [base+obmc_masks_avx2+16*2]
+.w16_loop:
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add tmpq, 32*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ movaps [rsp+ 8], xmm6
+ movaps [rsp+24], xmm7
+%endif
+ mova m6, [base+obmc_masks_avx2+32*2]
+ vbroadcasti128 m7, [base+obmc_masks_avx2+32*3]
+.w32_loop:
+ mova m0, [dstq+dsq*0+32*0]
+ psubw m3, m0, [tmpq +32*0]
+ mova xm2, [dstq+dsq*0+32*1]
+ mova xm5, [tmpq +32*1]
+ mova m1, [dstq+dsq*1+32*0]
+ psubw m4, m1, [tmpq +32*2]
+ vinserti128 m2, [dstq+dsq*1+32*1], 1
+ vinserti128 m5, [tmpq +32*3], 1
+ add tmpq, 32*4
+ psubw m5, m2, m5
+ pmulhrsw m3, m6
+ pmulhrsw m4, m6
+ pmulhrsw m5, m7
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*1+32*0], m1
+ mova [dstq+dsq*0+32*1], xm2
+ vextracti128 [dstq+dsq*1+32*1], m2, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ RET
+
+%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
+ mova m0, [dstq+32*(%1+0)]
+ psubw m2, m0, [tmpq+32*(%2+0)]
+ mova m1, [dstq+32*(%1+1)]
+ psubw m3, m1, [tmpq+32*(%2+1)]
+%if %3
+ add tmpq, 32*%3
+%endif
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+32*(%1+0)], m0
+ mova [dstq+32*(%1+1)], m1
+%endmacro
+
+INIT_XMM avx2
+cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_h_avx2_table
+ lea r5, [blend_h_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea maskq, [base+obmc_masks_avx2+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ pinsrd m0, [dstq+dsq*1], 1
+ movd m2, [maskq+hq*2]
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ punpcklwd m2, m2
+ psubw m1, m0, m1
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ pextrd [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova m3, [blend_shuf]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ movd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 8*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+INIT_YMM avx2
+.w8:
+ vbroadcasti128 m3, [blend_shuf]
+ shufpd m3, m3, 0x0c
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ vpbroadcastd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 16*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+.w16:
+ vpbroadcastw m4, [maskq+hq*2]
+ vpbroadcastw m5, [maskq+hq*2+2]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add tmpq, 32*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16
+ RET
+.w32:
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0, 2
+ add dstq, dsq
+ inc hq
+ jl .w32
+ RET
+.w64:
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 4
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 8
+ BLEND_H_ROW 4, -4
+ BLEND_H_ROW 6, -2
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor r12d, r12d
+ lea r10, [ihq-1]
+ cmp yq, ihq
+ cmovs r10, yq
+ test yq, yq
+ cmovs r10, r12
+ imul r10, sstrideq
+ add srcq, r10
+
+ ; ref += iclip(x, 0, iw - 1)
+ lea r10, [iwq-1]
+ cmp xq, iwq
+ cmovs r10, xq
+ test xq, xq
+ cmovs r10, r12
+ lea srcq, [srcq+r10*2]
+
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ lea bottomextq, [yq+bhq]
+ sub bottomextq, ihq
+ lea r3, [bhq-1]
+ cmovs bottomextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, r12
+ cmp bottomextq, bhq
+ cmovns bottomextq, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ lea rightextq, [xq+bwq]
+ sub rightextq, iwq
+ lea r2, [bwq-1]
+ cmovs rightextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, r12
+ cmp rightextq, bwq
+ cmovns rightextq, r2
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+ dst, dstride, src, sstride, bottomext, rightext
+
+ ; center_h = bh - top_ext - bottom_ext
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+ imul r2, dstrideq
+ add dstq, r2
+ mov r9m, dstq
+
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3:
+%if %1
+ ; left extension
+ xor r3, r3
+ vpbroadcastw m0, [srcq]
+.left_loop_%3:
+ mova [dstq+r3*2], m0
+ add r3, 16
+ cmp r3, leftextq
+ jl .left_loop_%3
+
+ ; body
+ lea r12, [dstq+leftextq*2]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ movu m0, [srcq+r3*2]
+%if %1
+ movu [r12+r3*2], m0
+%else
+ movu [dstq+r3*2], m0
+%endif
+ add r3, 16
+ cmp r3, centerwq
+ jl .body_loop_%3
+
+%if %2
+ ; right extension
+%if %1
+ lea r12, [r12+centerwq*2]
+%else
+ lea r12, [dstq+centerwq*2]
+%endif
+ xor r3, r3
+ vpbroadcastw m0, [srcq+centerwq*2-2]
+.right_loop_%3:
+ movu [r12+r3*2], m0
+ add r3, 16
+ cmp r3, rightextq
+ jl .right_loop_%3
+
+%endif
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+%endmacro
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ test rightextq, rightextq
+ jnz .need_right_ext
+ v_loop 0, 0, 0
+ jmp .body_done
+
+.need_left_ext:
+ test rightextq, rightextq
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+ ; bottom edge extension
+ test bottomextq, bottomextq
+ jz .top
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+.bottom_x_loop:
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, bottomextq
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .bottom_y_loop
+ add r1, 16
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+ mov srcq, r9m
+ mov dstq, dstm
+ xor r1, r1
+.top_x_loop:
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, topextq
+.top_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .top_y_loop
+ add r1, 16
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pd_64]
+ vpbroadcastw xm7, pxmaxm
+ pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+ pslld m5, 3 ; dx*8
+ pslld m6, 14
+ paddd m8, m2 ; mx+[0..7]*dx
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ vpbroadcastd m10, [base+pd_63]
+ pxor m2, m2
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ pand m9, m10 ; filter offset (masked)
+ ; load source pixels
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vextracti128 xm0, m0, 1
+ movu xm10, [srcq+r8*2]
+ movu xm11, [srcq+r9*2]
+ movu xm12, [srcq+r10*2]
+ movu xm13, [srcq+r11*2]
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vinserti128 m10, [srcq+r8*2], 1
+ vinserti128 m11, [srcq+r9*2], 1
+ vinserti128 m12, [srcq+r10*2], 1
+ vinserti128 m13, [srcq+r11*2], 1
+ ptest m1, m1
+ jz .filter
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vextracti128 xm1, m1, 1
+ movu xm14, [base+resize_shuf+8+r8*2]
+ movu xm15, [base+resize_shuf+8+r9*2]
+ movu xm0, [base+resize_shuf+8+r10*2]
+ movu xm2, [base+resize_shuf+8+r11*2]
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vinserti128 m14, [base+resize_shuf+8+r8*2], 1
+ vinserti128 m15, [base+resize_shuf+8+r9*2], 1
+ vinserti128 m0, [base+resize_shuf+8+r10*2], 1
+ vinserti128 m2, [base+resize_shuf+8+r11*2], 1
+ pshufb m10, m14
+ pshufb m11, m15
+ pshufb m12, m0
+ pshufb m13, m2
+.filter:
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vextracti128 xm9, m9, 1
+ movq xm14, [base+resize_filter+r8*8]
+ movq xm15, [base+resize_filter+r9*8]
+ movq xm0, [base+resize_filter+r10*8]
+ movq xm2, [base+resize_filter+r11*8]
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ movhps xm14, [base+resize_filter+r8*8]
+ movhps xm15, [base+resize_filter+r9*8]
+ movhps xm0, [base+resize_filter+r10*8]
+ movhps xm2, [base+resize_filter+r11*8]
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pmovsxbw m0, xm0
+ pmovsxbw m2, xm2
+ pmaddwd m10, m14
+ pmaddwd m11, m15
+ pmaddwd m12, m0
+ pmaddwd m13, m2
+ phaddd m10, m11
+ phaddd m12, m13
+ phaddd m10, m12
+ psubd m10, m3, m10
+ psrad m10, 7
+ vextracti128 xm0, m10, 1
+ packusdw xm10, xm0
+ pminsw xm10, xm7
+ mova [dstq+xq*2], xm10
+ paddd m4, m5
+ add xd, 8
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm
new file mode 100644
index 0000000000..585ba53e08
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_avx512.asm
@@ -0,0 +1,4858 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+ db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
+spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
+ db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
+ db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
+ db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
+spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+ db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
+spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
+ db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
+ db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
+ db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
+spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
+ db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
+ db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
+ db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
+spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
+ db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
+prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
+ db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
+ db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
+prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
+ db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
+ db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
+ db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
+spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
+ db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
+spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46
+ db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
+spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
+ db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
+ db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
+ db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
+spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
+ db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
+ db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
+ db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
+spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
+ db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
+ db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
+ db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
+spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
+ db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
+ db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
+ db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
+spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
+ db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
+ db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
+spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
+spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
+ db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
+w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
+ db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
+w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+ db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+ db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
+ db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
+w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
+ db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+ db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
+ db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
+w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
+ db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+ db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
+ db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
+warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
+ db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
+ db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
+ db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
+warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
+ db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
+ db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
+ db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
+warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
+ db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
+ db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
+ db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
+deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
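+              ; formed by overlapping the constants below; vpermq only uses
+              ; the low bits of each qword index, so the upper halves don't matter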
+pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
+ dd 1
+pw_2048: times 2 dw 2048
+ dd 3
+pw_8192: times 2 dw 8192
+avg_shift: dw 5, 5, 3, 3
+pw_27615: times 2 dw 27615
+pw_32766: times 2 dw 32766
+warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
+warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
+warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
+resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
+resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
+resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
+resize_permE: dq 0, 2, 4, 6
+resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
+resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+
+prep_hv_shift: dq 6, 4
+put_bilin_h_rnd: dw 8, 8, 10, 10
+prep_mul: dw 16, 16, 4, 4
+put_8tap_h_rnd: dd 34, 40
+prep_8tap_rnd: dd 128 - (8192 << 8)
+warp_8x8_rnd_h: dd 512, 2048
+warp_8x8_rnd_v: dd 262144, 65536
+warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
+avg_round: dw -16400, -16400, -16388, -16388
+w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4)
+mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6)
+w_mask_round: dd 128, 64
+bidir_shift: dw 6, 6, 4, 4
+
+pb_64: times 4 db 64
+pw_m512: times 2 dw -512
+pw_2: times 2 dw 2
+pw_64: times 2 dw 64
+pd_32: dd 32
+pd_63: dd 63
+pd_128: dd 128
+pd_640: dd 640
+pd_2176: dd 2176
+pd_16384: dd 16384
+pd_0_4: dd 0, 4
+
+%define pw_16 prep_mul
+%define pd_512 warp_8x8_rnd_h
+
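+; Jump tables: each macro emits per-width offsets relative to a base label;
+; the function entry indexes them with tzcnt(w).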
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)
+
+BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
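+; table_offset() is the displacement of a function's jump table from the shared
+; put/prep base label, so a single base register can address every table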
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter
+cextern obmc_masks_avx2
+cextern resize_filter
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 4
+%else
+DECLARE_REG_TMP 8
+%endif
+
+INIT_ZMM avx512icl
+cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
+ mov mxyd, r6m ; mx
+ lea r7, [put_avx512icl]
+ tzcnt t0d, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx t0d, word [r7+t0*2+table_offset(put,)]
+ add t0, r7
+ jmp t0
+.put_w2:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu xmm0, [srcq+ssq*0]
+ movu xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], xmm0
+ mova [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu ym0, [srcq+ssq*0]
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], ym0
+ mova [dstq+dsq*1], ym1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+64*0], m0
+ mova [dstq+dsq*0+64*1], m1
+ mova [dstq+dsq*1+64*0], m2
+ mova [dstq+dsq*1+64*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ movu m2, [srcq+64*2]
+ movu m3, [srcq+64*3]
+ add srcq, ssq
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ mova [dstq+64*2], m2
+ mova [dstq+64*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ vpbroadcastw m5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastd m4, [pw_16]
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
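+ ; the h rounding constant below folds both roundings: 10-bit uses (x+8)>>4,
+ ; 12-bit uses (x+10)>>4, since ((x+2)>>2 + 2)>>2 == (x+10)>>4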
+ movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
+ mov r6d, r8m ; bitdepth_max
+ add t0, r7
+ shr r6d, 11
+ vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
+ jmp t0
+.h_w2:
+ movq xmm1, [srcq+ssq*0]
+ movhps xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw xmm0, xmm1, xm4
+ psrlq xmm1, 16
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 4
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq xmm0, [srcq+ssq*0+0]
+ movhps xmm0, [srcq+ssq*1+0]
+ movq xmm1, [srcq+ssq*0+2]
+ movhps xmm1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw xmm0, xm4
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 4
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0+0]
+ vinserti32x4 ym0, [srcq+ssq*1+0], 1
+ movu xm1, [srcq+ssq*0+2]
+ vinserti32x4 ym1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw ym0, ym4
+ pmullw ym1, ym5
+ paddw ym0, ym6
+ paddw ym0, ym1
+ psrlw ym0, 4
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu ym0, [srcq+ssq*0+0]
+ vinserti32x8 m0, [srcq+ssq*1+0], 1
+ movu ym1, [srcq+ssq*0+2]
+ vinserti32x8 m1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m6
+ paddw m0, m1
+ psrlw m0, 4
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ pmullw m1, m4, [srcq+ssq*1+0]
+ pmullw m3, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m0, m6
+ paddw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ pmullw m0, m4, [srcq+64*0+0]
+ pmullw m2, m5, [srcq+64*0+2]
+ pmullw m1, m4, [srcq+64*1+0]
+ pmullw m3, m5, [srcq+64*1+2]
+ add srcq, ssq
+ paddw m0, m6
+ paddw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ pmullw m0, m4, [srcq+64*0+0]
+ pmullw m7, m5, [srcq+64*0+2]
+ pmullw m1, m4, [srcq+64*1+0]
+ pmullw m8, m5, [srcq+64*1+2]
+ pmullw m2, m4, [srcq+64*2+0]
+ pmullw m9, m5, [srcq+64*2+2]
+ pmullw m3, m4, [srcq+64*3+0]
+ pmullw m10, m5, [srcq+64*3+2]
+ add srcq, ssq
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ paddw m0, m7
+ paddw m1, m8
+ paddw m2, m9
+ paddw m3, m10
+ REPX {psrlw x, 4}, m0, m1, m2, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ mova [dstq+64*2], m2
+ mova [dstq+64*3], m3
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
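+ ; v only: dst = p0 + (((p1-p0)*my + 8) >> 4), computed as pmulhrsw by my<<11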
+ movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
+ shl mxyd, 11
+ vpbroadcastw m8, mxyd
+ add t0, r7
+ jmp t0
+.v_w2:
+ movd xmm0, [srcq+ssq*0]
+.v_w2_loop:
+ movd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq xmm2, xmm0, xmm1
+ movd xmm0, [srcq+ssq*0]
+ punpckldq xmm1, xmm0
+ psubw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ paddw xmm1, xmm2
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xmm0, [srcq+ssq*0]
+.v_w4_loop:
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq xmm2, xmm0, xmm1
+ movq xmm0, [srcq+ssq*0]
+ punpcklqdq xmm1, xmm0
+ psubw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ paddw xmm1, xmm2
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xmm0, [srcq+ssq*0]
+.v_w8_loop:
+ vbroadcasti128 ymm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd ymm2, ymm0, ymm1, 0xf0
+ vbroadcasti128 ymm0, [srcq+ssq*0]
+ vpblendd ymm1, ymm0, 0xf0
+ psubw ymm1, ymm2
+ pmulhrsw ymm1, ym8
+ paddw ymm1, ymm2
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ vzeroupper
+ RET
+.v_w16:
+ movu ym0, [srcq+ssq*0]
+.v_w16_loop:
+ movu ym3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw ym1, ym3, ym0
+ pmulhrsw ym1, ym8
+ paddw ym1, ym0
+ movu ym0, [srcq+ssq*0]
+ psubw ym2, ym0, ym3
+ pmulhrsw ym2, ym8
+ paddw ym2, ym3
+ mova [dstq+dsq*0], ym1
+ mova [dstq+dsq*1], ym2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+ movu m0, [srcq+ssq*0]
+.v_w32_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m8
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m8
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+.v_w64_loop:
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m4, m2, m0
+ pmulhrsw m4, m8
+ paddw m4, m0
+ movu m0, [srcq+ssq*0+64*0]
+ psubw m5, m3, m1
+ pmulhrsw m5, m8
+ paddw m5, m1
+ movu m1, [srcq+ssq*0+64*1]
+ psubw m6, m0, m2
+ pmulhrsw m6, m8
+ psubw m7, m1, m3
+ pmulhrsw m7, m8
+ mova [dstq+dsq*0+64*0], m4
+ mova [dstq+dsq*0+64*1], m5
+ paddw m6, m2
+ paddw m7, m3
+ mova [dstq+dsq*1+64*0], m6
+ mova [dstq+dsq*1+64*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*0+64*2]
+ movu m3, [srcq+ssq*0+64*3]
+.v_w128_loop:
+ movu m4, [srcq+ssq*1+64*0]
+ movu m5, [srcq+ssq*1+64*1]
+ movu m6, [srcq+ssq*1+64*2]
+ movu m7, [srcq+ssq*1+64*3]
+ lea srcq, [srcq+ssq*2]
+ psubw m9, m4, m0
+ pmulhrsw m9, m8
+ paddw m9, m0
+ movu m0, [srcq+ssq*0+64*0]
+ psubw m10, m5, m1
+ pmulhrsw m10, m8
+ paddw m10, m1
+ movu m1, [srcq+ssq*0+64*1]
+ psubw m11, m6, m2
+ pmulhrsw m11, m8
+ paddw m11, m2
+ movu m2, [srcq+ssq*0+64*2]
+ psubw m12, m7, m3
+ pmulhrsw m12, m8
+ paddw m12, m3
+ movu m3, [srcq+ssq*0+64*3]
+ mova [dstq+dsq*0+64*0], m9
+ psubw m9, m0, m4
+ pmulhrsw m9, m8
+ mova [dstq+dsq*0+64*1], m10
+ psubw m10, m1, m5
+ pmulhrsw m10, m8
+ mova [dstq+dsq*0+64*2], m11
+ psubw m11, m2, m6
+ pmulhrsw m11, m8
+ mova [dstq+dsq*0+64*3], m12
+ psubw m12, m3, m7
+ pmulhrsw m12, m8
+ paddw m9, m4
+ paddw m10, m5
+ mova [dstq+dsq*1+64*0], m9
+ mova [dstq+dsq*1+64*1], m10
+ paddw m11, m6
+ paddw m12, m7
+ mova [dstq+dsq*1+64*2], m11
+ mova [dstq+dsq*1+64*3], m12
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w128_loop
+ RET
+.hv:
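+ ; the h pass yields 14-bit intermediates for both depths (px*16 for 10-bit,
+ ; px*4 for 12-bit); the final pmulhrsw by 2048/8192 rounds back to pixel range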
+ movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastd m6, [pw_2]
+ vpbroadcastw m7, mxyd
+ vpbroadcastd m8, [pw_8192]
+ add t0, r7
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ vpbroadcastd m8, [pw_2048]
+.hv_12bpc:
+ jmp t0
+.hv_w2:
+ vpbroadcastq xmm1, [srcq+ssq*0]
+ pmullw xmm0, xmm1, xm4
+ psrlq xmm1, 16
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+.hv_w2_loop:
+ movq xmm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm2, [srcq+ssq*0]
+ pmullw xmm1, xmm2, xm4
+ psrlq xmm2, 16
+ pmullw xmm2, xm5
+ paddw xmm1, xm6
+ paddw xmm1, xmm2
+ psrlw xmm1, 2 ; 1 _ 2 _
+ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm7
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ pmullw xmm0, xm4, [srcq+ssq*0-8]
+ pmullw xmm1, xm5, [srcq+ssq*0-6]
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+.hv_w4_loop:
+ movq xmm1, [srcq+ssq*1+0]
+ movq xmm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm1, [srcq+ssq*0+0]
+ movhps xmm2, [srcq+ssq*0+2]
+ pmullw xmm1, xm4
+ pmullw xmm2, xm5
+ paddw xmm1, xm6
+ paddw xmm1, xmm2
+ psrlw xmm1, 2 ; 1 2
+ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm7
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xmm0, xm4, [srcq+ssq*0+0]
+ pmullw xmm1, xm5, [srcq+ssq*0+2]
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+ vinserti32x4 ym0, xmm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1+0]
+ movu xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym1, [srcq+ssq*0+0], 1
+ vinserti32x4 ym2, [srcq+ssq*0+2], 1
+ pmullw ym1, ym4
+ pmullw ym2, ym5
+ paddw ym1, ym6
+ paddw ym1, ym2
+ psrlw ym1, 2 ; 1 2
+ vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
+ mova ym0, ym1
+ psubw ym1, ym2
+ paddw ym1, ym1
+ pmulhw ym1, ym7
+ paddw ym1, ym2
+ pmulhrsw ym1, ym8
+ mova [dstq+dsq*0], xm1
+ vextracti32x4 [dstq+dsq*1], ym1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ pmullw ym0, ym4, [srcq+ssq*0+0]
+ pmullw ym1, ym5, [srcq+ssq*0+2]
+ paddw ym0, ym6
+ paddw ym0, ym1
+ psrlw ym0, 2
+ vinserti32x8 m0, ym0, 1
+.hv_w16_loop:
+ movu ym1, [srcq+ssq*1+0]
+ movu ym2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m1, [srcq+ssq*0+0], 1
+ vinserti32x8 m2, [srcq+ssq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m6
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ vshufi32x4 m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m7
+ paddw m1, m2
+ pmulhrsw m1, m8
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+.hv_w64:
+.hv_w128:
+ movifnidn wd, wm
+ lea r6d, [hq+wq*8-256]
+ mov r4, srcq
+ mov r7, dstq
+.hv_w32_loop0:
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m6
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w32_loop:
+ pmullw m3, m4, [srcq+ssq*1+0]
+ pmullw m1, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m3, m6
+ paddw m3, m1
+ psrlw m3, 2
+ psubw m1, m3, m0
+ paddw m1, m1
+ pmulhw m1, m7
+ paddw m1, m0
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ paddw m0, m6
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m3
+ paddw m2, m2
+ pmulhw m2, m7
+ paddw m2, m3
+ pmulhrsw m1, m8
+ pmulhrsw m2, m8
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r4, 64
+ add r7, 64
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
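+ ; scale to 14-bit intermediates (x16 for 10-bit, x4 for 12-bit) and subtract
+ ; 8192 so the values fit in signed 16-bit words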
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq xmm0, [srcq+strideq*0]
+ movhps xmm0, [srcq+strideq*1]
+ vpbroadcastq ymm1, [srcq+strideq*2]
+ vpbroadcastq ymm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm0, ymm1, 0x30
+ vpblendd ymm0, ymm2, 0xc0
+ pmullw ymm0, ym4
+ psubw ymm0, ym5
+ mova [tmpq], ymm0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ vzeroupper
+ RET
+.prep_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti32x4 ym0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+ vinserti32x4 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ psubw m0, m5
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m4, [srcq+strideq*2]
+ pmullw m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmullw m0, m4, [srcq+strideq*0+64*0]
+ pmullw m1, m4, [srcq+strideq*0+64*1]
+ pmullw m2, m4, [srcq+strideq*1+64*0]
+ pmullw m3, m4, [srcq+strideq*1+64*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmullw m0, m4, [srcq+64*0]
+ pmullw m1, m4, [srcq+64*1]
+ pmullw m2, m4, [srcq+64*2]
+ pmullw m3, m4, [srcq+64*3]
+ add srcq, strideq
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ vpbroadcastw m5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastd m6, [pw_32766]
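+ ; 32766 = 32768-2, so subtracting it before the arithmetic >>2 applies the
+ ; +2 rounding and the -8192 intermediate bias in a single step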
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ movu xm1, [srcq+strideq*0]
+ vinserti32x4 ym1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti32x4 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1, ym2
+ psrldq ym1, 2
+ psrldq ym2, 2
+ pmullw ym0, ym4
+ punpcklqdq ym1, ym2
+ pmullw ym1, ym5
+ psubw ym0, ym6
+ paddw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0+0]
+ movu xm1, [srcq+strideq*0+2]
+ vinserti32x4 ym0, [srcq+strideq*1+0], 1
+ vinserti32x4 ym1, [srcq+strideq*1+2], 1
+ vinserti32x4 m0, [srcq+strideq*2+0], 2
+ vinserti32x4 m1, [srcq+strideq*2+2], 2
+ vinserti32x4 m0, [srcq+stride3q +0], 3
+ vinserti32x4 m1, [srcq+stride3q +2], 3
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8
+ RET
+.h_w16:
+ movu ym0, [srcq+strideq*0+0]
+ vinserti32x8 m0, [srcq+strideq*1+0], 1
+ movu ym1, [srcq+strideq*0+2]
+ vinserti32x8 m1, [srcq+strideq*1+2], 1
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ pmullw m1, m4, [srcq+strideq*1+0]
+ pmullw m3, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+ pmullw m3, m5, [srcq+66]
+ add srcq, strideq
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m7, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+ pmullw m8, m5, [srcq+ 66]
+ pmullw m2, m4, [srcq+128]
+ pmullw m9, m5, [srcq+130]
+ pmullw m3, m4, [srcq+192]
+ pmullw m10, m5, [srcq+194]
+ add srcq, strideq
+ REPX {psubw x, m6}, m0, m1, m2, m3
+ paddw m0, m7
+ paddw m1, m8
+ paddw m2, m9
+ paddw m3, m10
+ REPX {psraw x, 2}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ vpbroadcastw m9, mxyd
+ vpbroadcastd m8, [pw_16]
+ vpbroadcastd m10, [pw_32766]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ psubw m8, m9
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m8, 2
+ psllw m9, 2
+.v_12bpc:
+ jmp wq
+.v_w4:
+ movq xmm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastq xmm2, [srcq+strideq*1]
+ vpbroadcastq ymm1, [srcq+strideq*2]
+ vpbroadcastq ymm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm2, ymm1, 0x30
+ vpblendd ymm2, ymm3, 0xc0
+ vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
+ movq xmm0, [srcq+strideq*0]
+ valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4
+ pmullw ymm1, ym8
+ pmullw ymm2, ym9
+ psubw ymm1, ym10
+ paddw ymm1, ymm2
+ psraw ymm1, 2
+ mova [tmpq], ymm1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ movu xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vinserti32x4 m1, [srcq+strideq*2], 2
+ vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ valignq m2, m0, m1, 2 ; 1 2 3 4
+ pmullw m1, m8
+ pmullw m2, m9
+ psubw m1, m10
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu ym0, [srcq+strideq*0]
+.v_w16_loop:
+ vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1
+ movu ym3, [srcq+strideq*2]
+ vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3
+ lea srcq, [srcq+strideq*4]
+ movu ym0, [srcq+strideq*0]
+ vshufi32x4 m3, m1, m3, q1032 ; 1 2
+ vshufi32x4 m4, m2, m0, q1032 ; 3 4
+ pmullw m1, m8
+ pmullw m2, m8
+ pmullw m3, m9
+ pmullw m4, m9
+ psubw m1, m10
+ psubw m2, m10
+ paddw m1, m3
+ paddw m2, m4
+ psraw m1, 2
+ psraw m2, 2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ movu m0, [srcq+strideq*0]
+.v_w32_loop:
+ movu m3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m8, m0
+ movu m0, [srcq+strideq*0]
+ pmullw m2, m8, m3
+ pmullw m3, m9
+ pmullw m4, m9, m0
+ psubw m1, m10
+ psubw m2, m10
+ paddw m1, m3
+ paddw m2, m4
+ psraw m1, 2
+ psraw m2, 2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+.v_w64_loop:
+ add srcq, strideq
+ pmullw m2, m8, m0
+ movu m0, [srcq+64*0]
+ pmullw m3, m8, m1
+ movu m1, [srcq+64*1]
+ pmullw m4, m9, m0
+ pmullw m5, m9, m1
+ psubw m2, m10
+ psubw m3, m10
+ paddw m2, m4
+ paddw m3, m5
+ psraw m2, 2
+ psraw m3, 2
+ mova [tmpq+64*0], m2
+ mova [tmpq+64*1], m3
+ add tmpq, 64*2
+ dec hd
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ movu m2, [srcq+64*2]
+ movu m3, [srcq+64*3]
+.v_w128_loop:
+ add srcq, strideq
+ pmullw m4, m8, m0
+ movu m0, [srcq+64*0]
+ pmullw m5, m8, m1
+ movu m1, [srcq+64*1]
+ pmullw m6, m8, m2
+ movu m2, [srcq+64*2]
+ pmullw m7, m8, m3
+ movu m3, [srcq+64*3]
+ pmullw m11, m9, m0
+ pmullw m12, m9, m1
+ pmullw m13, m9, m2
+ pmullw m14, m9, m3
+ REPX {psubw x, m10}, m4, m5, m6, m7
+ paddw m4, m11
+ paddw m5, m12
+ paddw m6, m13
+ paddw m7, m14
+ REPX {psraw x, 2}, m4, m5, m6, m7
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m5
+ mova [tmpq+64*2], m6
+ mova [tmpq+64*3], m7
+ add tmpq, 64*4
+ dec hd
+ jg .v_w128_loop
+ RET
+.hv:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastw m7, mxyd
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ movq xmm0, [srcq+strideq*0+0]
+ movq xmm1, [srcq+strideq*0+2]
+ pmullw xmm0, xm4
+ pmullw xmm1, xm5
+ psubw xmm0, xm6
+ paddw xmm0, xmm1
+ psraw xmm0, 2
+ vpbroadcastq ym0, xmm0
+.hv_w4_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 ym1, [srcq+stride3q ], 1
+ movu xm2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym2, [srcq+strideq*0], 1
+ punpcklqdq ym3, ym1, ym2
+ psrldq ym1, 2
+ psrldq ym2, 2
+ pmullw ym3, ym4
+ punpcklqdq ym1, ym2
+ pmullw ym1, ym5
+ psubw ym3, ym6
+ paddw ym1, ym3
+ psraw ym1, 2 ; 1 2 3 4
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym7
+ paddw ym1, ym2
+ mova [tmpq], ym1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+strideq*0+0]
+ pmullw xm1, xm5, [srcq+strideq*0+2]
+ psubw xm0, xm6
+ paddw xm0, xm1
+ psraw xm0, 2
+ vinserti32x4 m0, xm0, 3
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1+0]
+ movu xm2, [srcq+strideq*1+2]
+ vinserti32x4 ym1, [srcq+strideq*2+0], 1
+ vinserti32x4 ym2, [srcq+strideq*2+2], 1
+ vinserti32x4 m1, [srcq+stride3q +0], 2
+ vinserti32x4 m2, [srcq+stride3q +2], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 m1, [srcq+strideq*0+0], 3
+ vinserti32x4 m2, [srcq+strideq*0+2], 3
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m6
+ paddw m1, m2
+ psraw m1, 2 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ pmullw ym0, ym4, [srcq+strideq*0+0]
+ pmullw ym1, ym5, [srcq+strideq*0+2]
+ psubw ym0, ym6
+ paddw ym0, ym1
+ psraw ym0, 2
+ vinserti32x8 m0, ym0, 1
+.hv_w16_loop:
+ movu ym1, [srcq+strideq*1+0]
+ movu ym2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+strideq*0+0], 1
+ vinserti32x8 m2, [srcq+strideq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m6
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ vshufi32x4 m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m1, m5, [srcq+strideq*0+2]
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+.hv_w32_loop:
+ pmullw m3, m4, [srcq+strideq*1+0]
+ pmullw m1, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m3, m6
+ paddw m3, m1
+ psraw m3, 2
+ psubw m1, m3, m0
+ pmulhrsw m1, m7
+ paddw m1, m0
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ psubw m0, m6
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m3
+ pmulhrsw m2, m7
+ paddw m2, m3
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+ pmullw m3, m5, [srcq+66]
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+.hv_w64_loop:
+ add srcq, strideq
+ pmullw m2, m4, [srcq+ 0]
+ pmullw m8, m5, [srcq+ 2]
+ pmullw m3, m4, [srcq+64]
+ pmullw m9, m5, [srcq+66]
+ psubw m2, m6
+ psubw m3, m6
+ paddw m2, m8
+ paddw m3, m9
+ psraw m2, 2
+ psraw m3, 2
+ psubw m8, m2, m0
+ psubw m9, m3, m1
+ pmulhrsw m8, m7
+ pmulhrsw m9, m7
+ paddw m8, m0
+ mova m0, m2
+ paddw m9, m1
+ mova m1, m3
+ mova [tmpq+64*0], m8
+ mova [tmpq+64*1], m9
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m8, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+ pmullw m9, m5, [srcq+ 66]
+ pmullw m2, m4, [srcq+128]
+ pmullw m10, m5, [srcq+130]
+ pmullw m3, m4, [srcq+192]
+ pmullw m11, m5, [srcq+194]
+ REPX {psubw x, m6}, m0, m1, m2, m3
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ REPX {psraw x, 2}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, strideq
+ pmullw m8, m4, [srcq+ 0]
+ pmullw m12, m5, [srcq+ 2]
+ pmullw m9, m4, [srcq+ 64]
+ pmullw m13, m5, [srcq+ 66]
+ pmullw m10, m4, [srcq+128]
+ pmullw m14, m5, [srcq+130]
+ pmullw m11, m4, [srcq+192]
+ pmullw m15, m5, [srcq+194]
+ REPX {psubw x, m6}, m8, m9, m10, m11
+ paddw m8, m12
+ paddw m9, m13
+ paddw m10, m14
+ paddw m11, m15
+ REPX {psraw x, 2}, m8, m9, m10, m11
+ psubw m12, m8, m0
+ psubw m13, m9, m1
+ psubw m14, m10, m2
+ psubw m15, m11, m3
+ REPX {pmulhrsw x, m7}, m12, m13, m14, m15
+ paddw m12, m0
+ mova m0, m8
+ paddw m13, m1
+ mova m1, m9
+ mova [tmpq+64*0], m12
+ mova [tmpq+64*1], m13
+ paddw m14, m2
+ mova m2, m10
+ paddw m15, m3
+ mova m3, m11
+ mova [tmpq+64*2], m14
+ mova [tmpq+64*3], m15
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
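+; each constant packs two filter-table row offsets: the high word selects the
+; 8-tap variant, the low byte the 4-tap variant used for small widths/heights;
+; mx/my gets added to every byte lane via the 0x010101 multiply, and byte 1
+; then holds the raw subpel position for the 'test mxd/myd, 0xf00' checks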
+
+%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
+cglobal %1_8tap_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX)
+%endif
+%endmacro
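+; each generated entry point just loads the packed h/v filter types into
+; t0d/t1d and jumps to the shared 8tap body; the plain regular variant is
+; emitted last so it can simply fall through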
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%define buf rsp+stack_offset+8 ; shadow space
+%else
+DECLARE_REG_TMP 7, 8
+%define buf rsp-40 ; red zone
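+; buf is 32 bytes of scratch used to spill the filter coefficients so that
+; individual dwords can be re-broadcast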
+%endif
+
+MC_8TAP_FN put, sharp, SHARP, SHARP
+MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN put, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN put, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN put, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN put, regular, REGULAR, REGULAR
+
+cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movifnidn wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ sub srcq, 2
+ mova ym2, [spel_h_shuf2a]
+ pmovsxbw xmm4, [base+subpel_filters+mxq*8]
+ pshufd xmm3, xmm4, q1111
+ pshufd xmm4, xmm4, q2222
+.h_w2_loop:
+ movu xm1, [srcq+ssq*0]
+ vinserti32x4 ym1, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova xmm0, xm8
+ vpermb ym1, ym2, ym1
+ vpdpwssd xmm0, xmm3, xm1
+ vextracti32x4 xm1, ym1, 1
+ vpdpwssd xmm0, xmm4, xm1
+ psrad xmm0, 6
+ packusdw xmm0, xmm0
+ pminsw xmm0, xm9
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ vbroadcasti32x4 ym4, [spel_h_shufA]
+ vbroadcasti32x4 ym5, [spel_h_shufB]
+ pshufd xmm0, xmm0, q2211
+ vpbroadcastq ym6, xmm0
+ vpermq ym7, ymm0, q1111
+.h_w4_loop:
+ movu xm2, [srcq+ssq*0]
+ vinserti32x4 ym2, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova ym0, ym8
+ pshufb ym1, ym2, ym4
+ vpdpwssd ym0, ym6, ym1
+ pshufb ym2, ym5
+ vpdpwssd ym0, ym7, ym2
+ psrad ym0, 6
+ vextracti32x4 xm1, ym0, 1
+ packusdw xm0, xm1
+ pminsw xmm0, xm0, xm9
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov r7d, r8m
+ vpbroadcastw m9, r8m
+ shr r7d, 11
+ vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
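+ ; 34/40 = 32 + 2/8: the taps are stored summing to 64, and the extra term
+ ; folds the intermediate-precision rounding into a single (x+rnd)>>6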
+ cmp wd, 4
+ je .h_w4
+ jl .h_w2
+ shr mxd, 16
+ sub srcq, 6
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mova [buf], xmm0
+ vpbroadcastd m10, xmm0
+ vpbroadcastd m11, [buf+ 4]
+ vpbroadcastd m12, [buf+ 8]
+ vpbroadcastd m13, [buf+12]
+ sub wd, 16
+ je .h_w16
+ jg .h_w32
+.h_w8:
+ mova m4, [spel_h_shufA]
+ movu m5, [spel_h_shufB]
+ movu m6, [spel_h_shufC]
+ mova m7, [spel_h_shufD]
+.h_w8_loop:
+ movu ym2, [srcq+ssq*0]
+ vinserti32x8 m2, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova m0, m8
+ vpermb m1, m4, m2
+ vpdpwssd m0, m10, m1
+ vpermb m1, m5, m2
+ vpdpwssd m0, m11, m1
+ vpermb m1, m6, m2
+ vpdpwssd m0, m12, m1
+ vpermb m1, m7, m2
+ vpdpwssd m0, m13, m1
+ psrad m0, 6
+ vextracti32x8 ym1, m0, 1
+ packusdw ym0, ym1
+ pminsw ym0, ym9
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8_loop
+ RET
+.h_w16:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+.h_w16_loop:
+ movu ym2, [srcq+ssq*0+ 0]
+ vinserti32x8 m2, [srcq+ssq*1+ 0], 1
+ movu ym3, [srcq+ssq*0+16]
+ vinserti32x8 m3, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ mova m0, m8
+ mova m1, m8
+ pshufb m4, m2, m6
+ vpdpwssd m0, m10, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m12, m4 ; b2
+ pshufb m4, m2, m7
+ vpdpwssd m0, m11, m4 ; a1
+ pshufb m4, m3, m7
+ vpdpwssd m1, m13, m4 ; b3
+ shufpd m2, m3, 0x55
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a2
+ vpdpwssd m1, m10, m4 ; b0
+ pshufb m2, m7
+ vpdpwssd m0, m13, m2 ; a3
+ vpdpwssd m1, m11, m2 ; b1
+ psrad m0, 6
+ psrad m1, 6
+ packusdw m0, m1
+ pminsw m0, m9
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea dstq, [dstq+wq*2]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ neg wq
+.h_w32_loop0:
+ mov r6, wq
+.h_w32_loop:
+ movu m2, [srcq+r6*2+ 0]
+ movu m3, [srcq+r6*2+ 8]
+ mova m0, m8
+ mova m1, m8
+ pshufb m4, m2, m6
+ vpdpwssd m0, m10, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m10, m4 ; b0
+ vpdpwssd m0, m12, m4 ; a2
+ movu m4, [srcq+r6*2+16]
+ pshufb m3, m7
+ vpdpwssd m1, m11, m3 ; b1
+ vpdpwssd m0, m13, m3 ; a3
+ pshufb m3, m4, m6
+ vpdpwssd m1, m12, m3 ; b2
+ pshufb m2, m7
+ vpdpwssd m0, m11, m2 ; a1
+ pshufb m4, m7
+ vpdpwssd m1, m13, m4 ; b3
+ psrad m0, 6
+ psrad m1, 6
+ packusdw m0, m1
+ pminsw m0, m9
+ mova [dstq+r6*2], m0
+ add r6, 32
+ jl .h_w32_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m10, [pd_32]
+ pmovsxbw xmm0, [base+subpel_filters+myq*8]
+ tzcnt r7d, wd
+ vpbroadcastw m11, r8m
+ lea r6, [ssq*3]
+ movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
+ sub srcq, r6
+ mova [rsp+stack_offset+8], xmm0
+ vpbroadcastd m12, xmm0
+ add r7, r8
+ vpbroadcastd m13, [rsp+stack_offset+12]
+ vpbroadcastd m14, [rsp+stack_offset+16]
+ vpbroadcastd m15, [rsp+stack_offset+20]
+ jmp r7
+.v_w2:
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+ add srcq, r6
+ pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, r6
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklwd xmm3, xmm1 ; 45 56
+ punpcklwd xmm1, xmm2, xmm4 ; 01 12
+ punpckhwd xmm2, xmm4 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xmm5, xm10
+ vpdpwssd xmm5, xm12, xmm1 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm5, xm13, xmm2 ; a1 b1
+ mova xmm2, xmm3
+ vpdpwssd xmm5, xm14, xmm3 ; a2 b2
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm0, 0x02 ; 7 8
+ punpcklwd xmm3, xmm4 ; 67 78
+ vpdpwssd xmm5, xm15, xmm3 ; a3 b3
+ psrad xmm5, 6
+ packusdw xmm5, xmm5
+ pminsw xmm5, xm11
+ movd [dstq+dsq*0], xmm5
+ pextrd [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+ add srcq, r6
+ vpbroadcastq ymm4, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm5, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklwd ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm3, 0x30
+ punpcklwd ymm2, ymm4 ; 23 34
+ vpblendd ymm3, ymm5, 0x30
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq ymm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova ymm4, ym10
+ vpdpwssd ymm4, ym12, ymm1 ; a0 b0
+ mova ymm1, ymm2
+ vpdpwssd ymm4, ym13, ymm2 ; a1 b1
+ mova ymm2, ymm3
+ vpdpwssd ymm4, ym14, ymm3 ; a2 b2
+ vpblendd ymm3, ymm0, ymm5, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 67 78
+ vpdpwssd ymm4, ym15, ymm3 ; a3 b3
+ psrad ymm4, 6
+ vextracti128 xmm5, ymm4, 1
+ packusdw xmm4, xmm5
+ pminsw xmm4, xm11
+ movq [dstq+dsq*0], xmm4
+ movhps [dstq+dsq*1], xmm4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ vinserti32x4 m1, m2, [srcq+ssq*0], 0
+ vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2
+ add srcq, r6
+ vinserti32x4 ym2, [srcq+ssq*0], 1
+ vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4
+ mova m6, [spel_v_shuf8]
+ movu xm0, [srcq+ssq*1]
+ vinserti32x4 ym0, [srcq+ssq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
+ vpermb m1, m6, m1 ; 01 12
+ vpermb m2, m6, m2 ; 23 34
+ vpermb m3, m6, m0 ; 45 56
+.v_w8_loop:
+ vinserti32x4 m0, [srcq+ssq*1], 3
+ lea srcq, [srcq+ssq*2]
+ movu xm5, [srcq+ssq*0]
+ mova m4, m10
+ vpdpwssd m4, m12, m1 ; a0 b0
+ mova m1, m2
+ vshufi32x4 m0, m5, q1032 ; 6 7 8
+ vpdpwssd m4, m13, m2 ; a1 b1
+ mova m2, m3
+ vpdpwssd m4, m14, m3 ; a2 b2
+ vpermb m3, m6, m0 ; 67 78
+ vpdpwssd m4, m15, m3 ; a3 b3
+ psrad m4, 6
+ vextracti32x8 ym5, m4, 1
+ packusdw ym4, ym5
+ pminsw ym4, ym11
+ mova [dstq+dsq*0], xm4
+ vextracti32x4 [dstq+dsq*1], ym4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti32x8 m1, [srcq+ssq*1]
+ vinserti32x8 m0, m1, [srcq+ssq*0], 0
+ vinserti32x8 m1, [srcq+ssq*2], 1
+ mova m8, [spel_v_shuf16]
+ add srcq, r6
+ movu ym3, [srcq+ssq*0]
+ vinserti32x8 m3, [srcq+ssq*1], 1
+ movu ym5, [srcq+ssq*2]
+ add srcq, r6
+ vinserti32x8 m5, [srcq+ssq*0], 1
+ vpermb m0, m8, m0 ; 01
+ vpermb m1, m8, m1 ; 12
+ vpermb m3, m8, m3 ; 34
+ vpermb m5, m8, m5 ; 56
+ mova m9, [deint_q_shuf]
+ vpshrdd m2, m1, m3, 16 ; 23
+ vpshrdd m4, m3, m5, 16 ; 45
+.v_w16_loop:
+ mova m6, m10
+ mova m7, m10
+ vpdpwssd m6, m12, m0 ; a0
+ mova m0, m2
+ vpdpwssd m7, m12, m1 ; b0
+ mova m1, m3
+ vpdpwssd m6, m13, m2 ; a1
+ mova m2, m4
+ vpdpwssd m7, m13, m3 ; b1
+ mova m3, m5
+ vpdpwssd m6, m14, m4 ; a2
+ mova m4, m5
+ vpdpwssd m7, m14, m5 ; b2
+ movu ym5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m5, [srcq+ssq*0], 1
+ vpermb m5, m8, m5 ; 78
+ vpshrdd m4, m5, 16 ; 67
+ vpdpwssd m6, m15, m4 ; a3
+ vpdpwssd m7, m15, m5 ; b3
+ psrad m6, 6
+ psrad m7, 6
+ packusdw m6, m7
+ pminsw m6, m11
+ vpermq m6, m9, m6
+ mova [dstq+dsq*0], ym6
+ vextracti32x8 [dstq+dsq*1], m6, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ movaps [rsp+stack_offset+8], xmm6
+%endif
+ lea wd, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, dstq
+.v_w32_loop0:
+ movu m16, [srcq+ssq*0]
+ movu m17, [srcq+ssq*1]
+ movu m18, [srcq+ssq*2]
+ add srcq, r6
+ movu m19, [srcq+ssq*0]
+ movu m20, [srcq+ssq*1]
+ movu m21, [srcq+ssq*2]
+ add srcq, r6
+ movu m22, [srcq+ssq*0]
+ punpcklwd m0, m16, m17 ; 01l
+ punpckhwd m16, m17 ; 01h
+ punpcklwd m1, m17, m18 ; 12l
+ punpckhwd m17, m18 ; 12h
+ punpcklwd m2, m18, m19 ; 23l
+ punpckhwd m18, m19 ; 23h
+ punpcklwd m3, m19, m20 ; 34l
+ punpckhwd m19, m20 ; 34h
+ punpcklwd m4, m20, m21 ; 45l
+ punpckhwd m20, m21 ; 45h
+ punpcklwd m5, m21, m22 ; 56l
+ punpckhwd m21, m22 ; 56h
+.v_w32_loop:
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; a0l
+ mova m8, m10
+ vpdpwssd m8, m12, m16 ; a0h
+ mova m7, m10
+ vpdpwssd m7, m12, m1 ; b0l
+ mova m9, m10
+ vpdpwssd m9, m12, m17 ; b0h
+ mova m0, m2
+ vpdpwssd m6, m13, m2 ; a1l
+ mova m16, m18
+ vpdpwssd m8, m13, m18 ; a1h
+ mova m1, m3
+ vpdpwssd m7, m13, m3 ; b1l
+ mova m17, m19
+ vpdpwssd m9, m13, m19 ; b1h
+ mova m2, m4
+ vpdpwssd m6, m14, m4 ; a2l
+ mova m18, m20
+ vpdpwssd m8, m14, m20 ; a2h
+ mova m3, m5
+ vpdpwssd m7, m14, m5 ; b2l
+ mova m19, m21
+ vpdpwssd m9, m14, m21 ; b2h
+ movu m21, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m4, m22, m21 ; 67l
+ punpckhwd m20, m22, m21 ; 67h
+ movu m22, [srcq+ssq*0]
+ vpdpwssd m6, m15, m4 ; a3l
+ vpdpwssd m8, m15, m20 ; a3h
+ punpcklwd m5, m21, m22 ; 78l
+ punpckhwd m21, m22 ; 78h
+ vpdpwssd m7, m15, m5 ; b3l
+ vpdpwssd m9, m15, m21 ; b3h
+ REPX {psrad x, 6}, m6, m8, m7, m9
+ packusdw m6, m8
+ packusdw m7, m9
+ pminsw m6, m11
+ pminsw m7, m11
+ mova [dstq+dsq*0], m6
+ mova [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ movaps xmm6, [rsp+stack_offset+8]
+%endif
+ vzeroupper
+ RET
+.hv:
+ vpbroadcastw m11, r8m
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+ test dword r8m, 0x800
+ jnz .hv_12bit
+ vpbroadcastd m10, [pd_2176]
+ psllw xmm0, 6
+ jmp .hv_main
+.hv_12bit:
+ vpbroadcastd m10, [pd_640]
+ psllw xmm0, 4
+ psllw xmm1, 2
+.hv_main:
+ mova [buf+ 0], xmm0
+ mova [buf+16], xmm1
+ vpbroadcastd m8, [buf+ 4]
+ vpbroadcastd m9, [buf+ 8]
+ vpbroadcastd ym12, xmm1
+ vpbroadcastd ym13, [buf+20]
+ vpbroadcastd ym14, [buf+24]
+ vpbroadcastd ym15, [buf+28]
+ movu xm4, [srcq+ssq*0]
+ vinserti32x4 ym4, [srcq+ssq*1], 1
+ vinserti32x4 m4, [srcq+ssq*2], 2
+ add srcq, r6
+ vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3
+ movu xm0, [srcq+ssq*1]
+ vinserti32x4 ym0, [srcq+ssq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti32x4 m2, [spel_h_shufA]
+ mova m3, [spel_h_shuf2b]
+ mova ym6, [spel_h_shuf2a]
+ mova xm7, [spel_shuf2]
+ mova m1, m10
+ pshufb m4, m2
+ pshufb m0, m2
+ punpcklqdq m2, m4, m0
+ vpdpwssd m1, m8, m2 ; 04 15 26 3_
+ punpckhqdq m4, m0
+ vpdpwssd m1, m9, m4
+ vpermb m1, m3, m1 ; 01 12
+ vextracti32x4 xm2, ym1, 1 ; 23 34
+ vextracti32x4 xm3, m1, 2 ; 45 56
+.hv_w2_loop:
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym5, [srcq+ssq*0], 1
+ mova xm4, xm10
+ vpermb ym5, ym6, ym5
+ pmaddwd xmm0, xm12, xm1 ; a0 b0
+ vpdpwssd xm4, xm8, xm5
+ vextracti32x4 xm5, ym5, 1
+ mova xm1, xm2
+ vpdpwssd xmm0, xm13, xm2 ; a1 b1
+ vpdpwssd xm4, xm9, xm5 ; 7 8
+ mova xm2, xm3
+ vpdpwssd xmm0, xm14, xm3 ; a2 b2
+ vpermt2b xm3, xm7, xm4 ; 67 78
+ vpdpwssd xmm0, xm15, xm3 ; a3 b3
+ psrad xmm0, 10
+ packusdw xmm0, xmm0
+ pminsw xmm0, xm11
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ vbroadcasti32x4 m19, [spel_h_shufA]
+ vbroadcasti32x4 m20, [spel_h_shufB]
+ mova ym6, [spel_shuf4a]
+ mova ym7, [spel_shuf4b]
+ mova m2, m10
+ mova m3, m10
+ pshufb m1, m4, m19
+ vpdpwssd m2, m8, m1
+ pshufb m1, m0, m19
+ vpdpwssd m3, m8, m1
+ pshufb m4, m20
+ vpdpwssd m2, m9, m4
+ pshufb m0, m20
+ vpdpwssd m3, m9, m0
+ vpermb m1, m6, m2 ; 01 12
+ vshufi32x4 m2, m3, q1032
+ vpermb m3, m6, m3 ; 45 56
+ vpermb m2, m6, m2 ; 23 34
+.hv_w4_loop:
+ movu xm18, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 ym18, [srcq+ssq*0], 1
+ mova ym4, ym10
+ pshufb ym17, ym18, ym19
+ pmaddwd ym16, ym12, ym1 ; a0 b0
+ vpdpwssd ym4, ym8, ym17
+ pshufb ym18, ym20
+ mova ym1, ym2
+ vpdpwssd ym16, ym13, ym2 ; a1 b1
+ vpdpwssd ym4, ym9, ym18 ; 7 8
+ mova ym2, ym3
+ vpdpwssd ym16, ym14, ym3 ; a2 b2
+ vpermt2b ym3, ym7, ym4 ; 67 78
+ vpdpwssd ym16, ym15, ym3 ; a3 b3
+ psrad ym16, 10
+ vextracti128 xm17, ym16, 1
+ packusdw xm16, xm17
+ pminsw xm16, xm11
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ test dword r8m, 0x800
+ jnz .hv_w8_12bit
+ vpbroadcastd m10, [pd_2176]
+ psllw xmm0, 6
+ jmp .hv_w8_main
+.hv_w8_12bit:
+ vpbroadcastd m10, [pd_640]
+ psllw xmm0, 4
+ psllw xmm1, 2
+.hv_w8_main:
+ mova [buf+ 0], xmm0
+ mova [buf+16], xmm1
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [buf+ 4]
+ vpbroadcastd m14, [buf+ 8]
+ vpbroadcastd m15, [buf+12]
+ vpbroadcastd m16, xmm1
+ vpbroadcastd m17, [buf+20]
+ vpbroadcastd m18, [buf+24]
+ vpbroadcastd m19, [buf+28]
+ cmp wd, 16
+ je .hv_w16
+ jg .hv_w32
+ mova m5, [spel_h_shufA]
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1
+ movu ym9, [srcq+ssq*2]
+ add srcq, r6
+ vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3
+ movu ym20, [srcq+ssq*1]
+ vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5
+ add srcq, r6
+ movu ym21, [srcq+ssq*0] ; 6
+ movu m6, [spel_h_shufB]
+ movu m7, [spel_h_shufC]
+ vpermb m8, m5, m0
+ mova m1, m10
+ vpdpwssd m1, m12, m8 ; a0 b0
+ vpermb m8, m5, m9
+ mova m2, m10
+ vpdpwssd m2, m12, m8 ; c0 d0
+ vpermb m8, m5, m20
+ mova m3, m10
+ vpdpwssd m3, m12, m8 ; e0 f0
+ vpermb m8, m5, m21
+ mova m4, m10
+ vpdpwssd m4, m12, m8 ; g0
+ vpermb m8, m6, m0
+ vpdpwssd m1, m13, m8 ; a1 b1
+ vpermb m8, m6, m9
+ vpdpwssd m2, m13, m8 ; c1 d1
+ vpermb m8, m6, m20
+ vpdpwssd m3, m13, m8 ; e1 f1
+ vpermb m8, m6, m21
+ vpdpwssd m4, m13, m8 ; g1
+ vpermb m8, m7, m0
+ vpdpwssd m1, m14, m8 ; a2 b2
+ vpermb m8, m7, m9
+ vpdpwssd m2, m14, m8 ; c2 d2
+ vpermb m8, m7, m20
+ vpdpwssd m3, m14, m8 ; e2 f2
+ vpermb m8, m7, m21
+ vpdpwssd m4, m14, m8 ; g2
+ mova m8, [spel_h_shufD]
+ vpermb m0, m8, m0
+ vpdpwssd m1, m15, m0 ; a3 b3
+ mova m0, [spel_shuf8a]
+ vpermb m9, m8, m9
+ vpdpwssd m2, m15, m9 ; c3 d3
+ mova m9, [spel_shuf8b]
+ vpermb m20, m8, m20
+ vpdpwssd m3, m15, m20 ; e3 f3
+ vpermb m21, m8, m21
+ vpdpwssd m4, m15, m21 ; g3
+ vpermt2b m1, m0, m2 ; 01 12
+ vpermt2b m2, m0, m3 ; 23 34
+ vpermt2b m3, m0, m4 ; 45 56
+.hv_w8_loop:
+ movu ym0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m0, [srcq+ssq*0], 1
+ mova m4, m10
+ vpermb m21, m5, m0
+ vpdpwssd m4, m12, m21 ; h0 i0
+ vpermb m21, m6, m0
+ pmaddwd m20, m16, m1 ; A0 B0
+ vpdpwssd m4, m13, m21 ; h1 i1
+ vpermb m21, m7, m0
+ mova m1, m2
+ vpdpwssd m20, m17, m2 ; A1 B1
+ vpdpwssd m4, m14, m21 ; h2 i2
+ vpermb m21, m8, m0
+ mova m2, m3
+ vpdpwssd m20, m18, m3 ; A2 B2
+ vpdpwssd m4, m15, m21 ; h3 i3
+ vpermt2b m3, m9, m4 ; 67 78
+ vpdpwssd m20, m19, m3 ; A3 B3
+ psrad m20, 10
+ vextracti32x8 ym21, m20, 1
+ packusdw ym20, ym21
+ pminsw ym20, ym11
+ mova [dstq+dsq*0], xm20
+ vextracti128 [dstq+dsq*1], ym20, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+.hv_w16:
+ WIN64_SPILL_XMM 26
+ vbroadcasti32x8 m5, [srcq+ssq*0+ 8]
+ vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0
+ vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0
+ movu ym6, [srcq+ssq*1+ 0]
+ movu ym7, [srcq+ssq*1+16]
+ vinserti32x8 m6, [srcq+ssq*2+ 0], 1
+ vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2
+ add srcq, r6
+ movu ym22, [srcq+ssq*0+ 0]
+ movu ym23, [srcq+ssq*0+16]
+ vinserti32x8 m22, [srcq+ssq*1+ 0], 1
+ vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4
+ movu ym24, [srcq+ssq*2+ 0]
+ movu ym25, [srcq+ssq*2+16]
+ add srcq, r6
+ vinserti32x8 m24, [srcq+ssq*0+ 0], 1
+ vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m9, [spel_shuf16]
+ pshufb m0, m4, m20
+ mova m1, m10
+ vpdpwssd m1, m12, m0 ; a0
+ pshufb m0, m6, m20
+ mova m2, m10
+ vpdpwssd m2, m12, m0 ; b0
+ pshufb m0, m7, m20
+ mova m3, m10
+ vpdpwssd m3, m14, m0 ; c2
+ pshufb m0, m4, m21
+ vpdpwssd m1, m13, m0 ; a1
+ pshufb m0, m6, m21
+ vpdpwssd m2, m13, m0 ; b1
+ pshufb m0, m7, m21
+ vpdpwssd m3, m15, m0 ; c3
+ pshufb m0, m5, m20
+ vpdpwssd m1, m14, m0 ; a2
+ shufpd m6, m7, 0x55
+ pshufb m7, m6, m20
+ vpdpwssd m2, m14, m7 ; b2
+ vpdpwssd m3, m12, m7 ; c0
+ pshufb m5, m21
+ vpdpwssd m1, m15, m5 ; a3
+ pshufb m6, m21
+ vpdpwssd m2, m15, m6 ; b3
+ vpdpwssd m3, m13, m6 ; c1
+ pshufb m0, m22, m20
+ mova m4, m10
+ vpdpwssd m4, m12, m0 ; d0
+ pshufb m0, m23, m20
+ mova m5, m10
+ vpdpwssd m5, m14, m0 ; e2
+ pshufb m0, m24, m20
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; f0
+ pshufb m0, m25, m20
+ mova m7, m10
+ vpdpwssd m7, m14, m0 ; g2
+ pshufb m0, m22, m21
+ vpdpwssd m4, m13, m0 ; d1
+ pshufb m0, m23, m21
+ vpdpwssd m5, m15, m0 ; e3
+ pshufb m0, m24, m21
+ vpdpwssd m6, m13, m0 ; f1
+ pshufb m0, m25, m21
+ vpdpwssd m7, m15, m0 ; g3
+ shufpd m22, m23, 0x55
+ pshufb m23, m22, m20
+ vpdpwssd m4, m14, m23 ; d2
+ vpdpwssd m5, m12, m23 ; e0
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m6, m14, m25 ; f2
+ vpdpwssd m7, m12, m25 ; g0
+ pshufb m22, m21
+ vpdpwssd m4, m15, m22 ; d3
+ vpdpwssd m5, m13, m22 ; e1
+ pshufb m24, m21
+ vpdpwssd m6, m15, m24 ; f3
+ vpdpwssd m7, m13, m24 ; g1
+ pslldq m1, 1
+ vpermt2b m2, m9, m3 ; 12
+ vpermt2b m4, m9, m5 ; 34
+ vpermt2b m6, m9, m7 ; 56
+ vpshrdd m1, m2, 16 ; 01
+ vpshrdd m3, m2, m4, 16 ; 23
+ vpshrdd m5, m4, m6, 16 ; 45
+.hv_w16_loop:
+ movu ym24, [srcq+ssq*1+ 0]
+ movu ym25, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m24, [srcq+ssq*0+ 0], 1
+ vinserti32x8 m25, [srcq+ssq*0+16], 1
+ mova m7, m10
+ mova m8, m10
+ pshufb m0, m24, m20
+ vpdpwssd m7, m12, m0 ; h0
+ pshufb m0, m25, m20
+ vpdpwssd m8, m14, m0 ; i2
+ pmaddwd m22, m16, m1 ; A0
+ mova m1, m3
+ pmaddwd m23, m16, m2 ; B0
+ mova m2, m4
+ pshufb m0, m24, m21
+ vpdpwssd m7, m13, m0 ; h1
+ pshufb m0, m25, m21
+ vpdpwssd m8, m15, m0 ; i3
+ vpdpwssd m22, m17, m3 ; A1
+ mova m3, m5
+ vpdpwssd m23, m17, m4 ; B1
+ mova m4, m6
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m7, m14, m25 ; h2
+ vpdpwssd m8, m12, m25 ; i0
+ vpdpwssd m22, m18, m5 ; A2
+ vpdpwssd m23, m18, m6 ; B2
+ pshufb m24, m21
+ vpdpwssd m7, m15, m24 ; h3
+ vpdpwssd m8, m13, m24 ; i1
+ vpermt2b m7, m9, m8 ; 78
+ vpshrdd m5, m6, m7, 16 ; 67
+ vpdpwssd m22, m19, m5 ; A3
+ vpdpwssd m23, m19, m7 ; B3
+ mova m6, m7
+ psrad m22, 10
+ psrad m23, 10
+ vshufi32x4 m0, m22, m23, q3232
+ vinserti32x8 m22, ym23, 1
+ packusdw m22, m0
+ pminsw m22, m11
+ mova [dstq+dsq*0], ym22
+ vextracti32x8 [dstq+dsq*1], m22, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 32
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m22, [spel_shuf32]
+ lea wd, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, dstq
+.hv_w32_loop0:
+ movu m6, [srcq+ssq*0+ 0]
+ movu m7, [srcq+ssq*0+ 8]
+ movu m8, [srcq+ssq*0+16]
+ mova m0, m10
+ mova m23, m10
+ pshufb m9, m6, m20
+ vpdpwssd m0, m12, m9 ; a0l
+ pshufb m9, m7, m20
+ vpdpwssd m23, m12, m9 ; a0h
+ vpdpwssd m0, m14, m9 ; a2l
+ pshufb m7, m21
+ vpdpwssd m23, m13, m7 ; a1h
+ vpdpwssd m0, m15, m7 ; a3l
+ pshufb m7, m8, m20
+ vpdpwssd m23, m14, m7 ; a2h
+ pshufb m6, m21
+ vpdpwssd m0, m13, m6 ; a1l
+ pshufb m8, m21
+ vpdpwssd m23, m15, m8 ; a3h
+%macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2]
+ movu m6, [srcq+%3*%4+ 0]
+ movu m7, [srcq+%3*%4+ 8]
+ movu m8, [srcq+%3*%4+16]
+%if %4 == 2
+ add srcq, r6
+%endif
+ movu m29, [srcq+%3*%5+ 0]
+ movu m30, [srcq+%3*%5+ 8]
+ movu m31, [srcq+%3*%5+16]
+%if %5 == 2
+ add srcq, r6
+%endif
+ mova m%1, m10
+ mova m9, m10
+ pshufb m%2, m6, m20
+ vpdpwssd m%1, m12, m%2 ; x0l
+ pshufb m%2, m29, m20
+ vpdpwssd m9, m12, m%2 ; y0l
+ pshufb m6, m21
+ vpdpwssd m%1, m13, m6 ; x1l
+ pshufb m29, m21
+ vpdpwssd m9, m13, m29 ; y1l
+ pshufb m6, m7, m20
+ mova m%2, m10
+ vpdpwssd m%2, m12, m6 ; x0h
+ pshufb m29, m30, m20
+ vpdpwssd m%1, m14, m6 ; x2l
+ mova m6, m10
+ vpdpwssd m6, m12, m29 ; y0h
+ pshufb m7, m21
+ vpdpwssd m9, m14, m29 ; y2l
+ pshufb m30, m21
+ vpdpwssd m%2, m13, m7 ; x1h
+ vpdpwssd m%1, m15, m7 ; x3l
+ pshufb m7, m8, m20
+ vpdpwssd m6, m13, m30 ; y1h
+ vpdpwssd m9, m15, m30 ; y3l
+ pshufb m30, m31, m20
+ vpdpwssd m%2, m14, m7 ; x2h
+ pshufb m8, m21
+ vpdpwssd m6, m14, m30 ; y2h
+ pshufb m31, m21
+ vpdpwssd m%2, m15, m8 ; x3h
+ vpdpwssd m6, m15, m31 ; y3h
+%if %1 == 1
+ vpermt2b m0, m22, m%1 ; 01l
+ vpermt2b m23, m22, m%2 ; 01h
+%endif
+ vpermt2b m%1, m22, m9 ; xyl
+ vpermt2b m%2, m22, m6 ; xyh
+%endmacro
+ PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12
+ PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34
+ PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23l
+ vpshrdd m25, m24, m26, 16 ; 23h
+ vpshrdd m4, m3, m5, 16 ; 45l
+ vpshrdd m27, m26, m28, 16 ; 45h
+.hv_w32_loop:
+ movu m7, [srcq+ssq*1+ 0]
+ movu m9, [srcq+ssq*2+ 0]
+ movu m6, [srcq+ssq*1+ 8]
+ movu m8, [srcq+ssq*2+ 8]
+ mova m29, m10
+ mova m31, m10
+ pshufb m30, m7, m20
+ vpdpwssd m29, m12, m30 ; h0l
+ pshufb m30, m9, m20
+ vpdpwssd m31, m12, m30 ; i0l
+ pshufb m7, m21
+ vpdpwssd m29, m13, m7 ; h1l
+ pshufb m9, m21
+ vpdpwssd m31, m13, m9 ; i1l
+ pshufb m7, m6, m20
+ vpdpwssd m29, m14, m7 ; h2l
+ pshufb m9, m8, m20
+ vpdpwssd m31, m14, m9 ; i2l
+ pshufb m6, m21
+ vpdpwssd m29, m15, m6 ; h3l
+ pshufb m8, m21
+ vpdpwssd m31, m15, m8 ; i3l
+ mova m30, m10
+ vpdpwssd m30, m12, m7 ; h0h
+ movu m7, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*2]
+ vpermt2b m29, m22, m31 ; 78l
+ mova m31, m10
+ vpdpwssd m31, m12, m9 ; i0h
+ movu m9, [srcq+ssq*0+16]
+ vpdpwssd m30, m13, m6 ; h1h
+ pshufb m6, m7, m20
+ vpdpwssd m31, m13, m8 ; i1h
+ pshufb m8, m9, m20
+ vpdpwssd m30, m14, m6 ; h2h
+ pmaddwd m6, m16, m0 ; A0l
+ pshufb m7, m21
+ vpdpwssd m31, m14, m8 ; i2h
+ pmaddwd m8, m16, m23 ; A0h
+ pshufb m9, m21
+ vpdpwssd m30, m15, m7 ; h3h
+ pmaddwd m7, m16, m1 ; B0l
+ vpdpwssd m31, m15, m9 ; i3h
+ pmaddwd m9, m16, m24 ; B0h
+ mova m0, m2
+ vpdpwssd m6, m17, m2 ; A1l
+ mova m23, m25
+ vpdpwssd m8, m17, m25 ; A1h
+ mova m1, m3
+ vpdpwssd m7, m17, m3 ; B1l
+ mova m24, m26
+ vpdpwssd m9, m17, m26 ; B1h
+ vpermt2b m30, m22, m31 ; 78h
+ vpdpwssd m6, m18, m4 ; A2l
+ mova m2, m4
+ vpdpwssd m8, m18, m27 ; A2h
+ mova m25, m27
+ vpdpwssd m7, m18, m5 ; B2l
+ mova m3, m5
+ vpdpwssd m9, m18, m28 ; B2h
+ mova m26, m28
+ vpshrdd m4, m5, m29, 16 ; 67l
+ vpdpwssd m6, m19, m4 ; A3l
+ vpshrdd m27, m28, m30, 16 ; 67h
+ vpdpwssd m8, m19, m27 ; A3h
+ mova m5, m29
+ vpdpwssd m7, m19, m29 ; B3l
+ mova m28, m30
+ vpdpwssd m9, m19, m30 ; B3h
+ REPX {psrad x, 10}, m6, m8, m7, m9
+ packusdw m6, m8
+ packusdw m7, m9
+ pminsw m6, m11
+ pminsw m7, m11
+ mova [dstq+dsq*0], m6
+ mova [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+MC_8TAP_FN prep, sharp, SHARP, SHARP
+MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN prep, regular, REGULAR, REGULAR
+
+cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
+%define base r7-prep_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl]
+ mov wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [pw_8192]
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ shr r5d, 11
+ vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mov r5d, r7m
+ vbroadcasti32x4 m4, [spel_h_shufA]
+ vbroadcasti32x4 m5, [spel_h_shufB]
+ shr r5d, 11
+ mova ym9, [prep_endA]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ mova [tmpq], xmm0
+ vpbroadcastd m6, [tmpq+4]
+ vpbroadcastd m7, [tmpq+8]
+.h_w4_loop:
+ movu xm2, [srcq+strideq*0]
+ vinserti32x4 ym2, [srcq+strideq*1], 1
+ vinserti32x4 m2, [srcq+strideq*2], 2
+ vinserti32x4 m2, [srcq+r6 ], 3
+ lea srcq, [srcq+strideq*4]
+ mova m0, m10
+ pshufb m1, m2, m4
+ vpdpwssd m0, m6, m1
+ pshufb m2, m5
+ vpdpwssd m0, m7, m2
+ vpermb m0, m9, m0
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m10, [prep_8tap_rnd]
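+ ; rnd = 128 - (8192 << 8): the filters are pre-shifted (prep_hv_shift) so the
+ ; prep_end permutations can keep bits [23:8] of each dword sum; +128 rounds
+ ; those 8 fraction bits and the pre-scaled term applies the -8192 prep bias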
+ lea r6, [strideq*3]
+ cmp wd, 4
+ je .h_w4
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mov r5d, r7m
+ sub srcq, 6
+ shr r5d, 11
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ mova [tmpq], xmm0
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ cmp wd, 16
+ je .h_w16
+ jg .h_w32
+.h_w8:
+ mova m6, [spel_h_shufA]
+ movu m7, [spel_h_shufB]
+ movu m8, [spel_h_shufC]
+ mova m9, [spel_h_shufD]
+ mova m11, [prep_endB]
+.h_w8_loop:
+ movu ym4, [srcq+strideq*0]
+ vinserti32x8 m4, [srcq+strideq*1], 1
+ movu ym5, [srcq+strideq*2]
+ vinserti32x8 m5, [srcq+r6 ], 1
+ lea srcq, [srcq+strideq*4]
+ mova m0, m10
+ mova m1, m10
+ vpermb m2, m6, m4
+ vpermb m3, m6, m5
+ vpdpwssd m0, m12, m2
+ vpdpwssd m1, m12, m3
+ vpermb m2, m7, m4
+ vpermb m3, m7, m5
+ vpdpwssd m0, m13, m2
+ vpdpwssd m1, m13, m3
+ vpermb m2, m8, m4
+ vpermb m3, m8, m5
+ vpdpwssd m0, m14, m2
+ vpdpwssd m1, m14, m3
+ vpermb m2, m9, m4
+ vpermb m3, m9, m5
+ vpdpwssd m0, m15, m2
+ vpdpwssd m1, m15, m3
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ mova m11, [prep_endC]
+.h_w16_loop:
+ movu ym2, [srcq+strideq*0+ 0]
+ vinserti32x8 m2, [srcq+strideq*1+ 0], 1
+ movu ym3, [srcq+strideq*0+16]
+ vinserti32x8 m3, [srcq+strideq*1+16], 1
+ lea srcq, [srcq+strideq*2]
+ mova m0, m10
+ mova m1, m10
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m14, m4 ; b2
+ pshufb m4, m2, m7
+ vpdpwssd m0, m13, m4 ; a1
+ pshufb m4, m3, m7
+ vpdpwssd m1, m15, m4 ; b3
+ shufpd m2, m3, 0x55
+ pshufb m4, m2, m6
+ vpdpwssd m0, m14, m4 ; a2
+ vpdpwssd m1, m12, m4 ; b0
+ pshufb m2, m7
+ vpdpwssd m0, m15, m2 ; a3
+ vpdpwssd m1, m13, m2 ; b1
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ neg wq
+ mova m11, [prep_endC]
+.h_w32_loop0:
+ mov r6, wq
+.h_w32_loop:
+ movu m2, [srcq+r6*2+ 0]
+ movu m3, [srcq+r6*2+ 8]
+ mova m0, m10
+ mova m1, m10
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m12, m4 ; b0
+ vpdpwssd m0, m14, m4 ; a2
+ movu m4, [srcq+r6*2+16]
+ pshufb m3, m7
+ vpdpwssd m1, m13, m3 ; b1
+ vpdpwssd m0, m15, m3 ; a3
+ pshufb m3, m4, m6
+ vpdpwssd m1, m14, m3 ; b2
+ pshufb m2, m7
+ vpdpwssd m0, m13, m2 ; a1
+ pshufb m4, m7
+ vpdpwssd m1, m15, m4 ; b3
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ add r6, 32
+ jl .h_w32_loop
+ add srcq, strideq
+ dec hd
+ jg .h_w32_loop0
+ RET
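+; vertical-only path: rows are kept interleaved in pairs (01 12, 23 34, ...)
+; so each vpdpwssd consumes two filter taps per step, producing two output
+; rows per loop iteration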
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mov r5d, r7m
+ vpbroadcastd m10, [prep_8tap_rnd]
+ pmovsxbw xmm0, [base+subpel_filters+myq*8]
+ tzcnt r6d, wd
+ shr r5d, 11
+ movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ add r7, r6
+ lea r6, [strideq*3]
+ sub srcq, r6
+ mova [tmpq], xmm0
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ jmp r7
+.v_w4:
+ movq xmm1, [srcq+strideq*0]
+ vpbroadcastq ymm0, [srcq+strideq*1]
+ vpbroadcastq ymm2, [srcq+strideq*2]
+ add srcq, r6
+ vpbroadcastq ymm4, [srcq+strideq*0]
+ vpbroadcastq ymm3, [srcq+strideq*1]
+ vpbroadcastq ymm5, [srcq+strideq*2]
+ mova xm11, [prep_endA]
+ add srcq, r6
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklwd ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+strideq*0]
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm3, 0x30
+ punpcklwd ymm2, ymm4 ; 23 34
+ vpblendd ymm3, ymm5, 0x30
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq ymm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ mova ymm4, ym10
+ vpdpwssd ymm4, ym12, ymm1 ; a0 b0
+ mova ymm1, ymm2
+ vpdpwssd ymm4, ym13, ymm2 ; a1 b1
+ mova ymm2, ymm3
+ vpdpwssd ymm4, ym14, ymm3 ; a2 b2
+ vpblendd ymm3, ymm0, ymm5, 0x30
+ vpbroadcastq ymm0, [srcq+strideq*0]
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 67 78
+ vpdpwssd ymm4, ym15, ymm3 ; a3 b3
+ vpermb ymm4, ym11, ymm4
+ mova [tmpq], xmm4
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ vbroadcasti32x4 m2, [srcq+strideq*2]
+ vinserti32x4 m1, m2, [srcq+strideq*0], 0
+ vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2
+ add srcq, r6
+ vinserti32x4 ym2, [srcq+strideq*0], 1
+ vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4
+ mova m6, [spel_v_shuf8]
+ movu xm0, [srcq+strideq*1]
+ vinserti32x4 ym0, [srcq+strideq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
+ mova ym11, [prep_endB]
+ vpermb m1, m6, m1 ; 01 12
+ vpermb m2, m6, m2 ; 23 34
+ vpermb m3, m6, m0 ; 45 56
+.v_w8_loop:
+ vinserti32x4 m0, [srcq+strideq*1], 3
+ lea srcq, [srcq+strideq*2]
+ movu xm5, [srcq+strideq*0]
+ mova m4, m10
+ vpdpwssd m4, m12, m1 ; a0 b0
+ mova m1, m2
+ vshufi32x4 m0, m5, q1032 ; 6 7 8
+ vpdpwssd m4, m13, m2 ; a1 b1
+ mova m2, m3
+ vpdpwssd m4, m14, m3 ; a2 b2
+ vpermb m3, m6, m0 ; 67 78
+ vpdpwssd m4, m15, m3 ; a3 b3
+ vpermb m4, m11, m4
+ mova [tmpq], ym4
+ add tmpq, 32
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti32x8 m1, [srcq+strideq*1]
+ vinserti32x8 m0, m1, [srcq+strideq*0], 0
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ mova m8, [spel_v_shuf16]
+ add srcq, r6
+ movu ym3, [srcq+strideq*0]
+ vinserti32x8 m3, [srcq+strideq*1], 1
+ movu ym5, [srcq+strideq*2]
+ add srcq, r6
+ vinserti32x8 m5, [srcq+strideq*0], 1
+ mova m11, [prep_endA]
+ vpermb m0, m8, m0 ; 01
+ vpermb m1, m8, m1 ; 12
+ vpermb m3, m8, m3 ; 34
+ vpermb m5, m8, m5 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23
+ vpshrdd m4, m3, m5, 16 ; 45
+.v_w16_loop:
+ mova m6, m10
+ mova m7, m10
+ vpdpwssd m6, m12, m0 ; a0
+ mova m0, m2
+ vpdpwssd m7, m12, m1 ; b0
+ mova m1, m3
+ vpdpwssd m6, m13, m2 ; a1
+ mova m2, m4
+ vpdpwssd m7, m13, m3 ; b1
+ mova m3, m5
+ vpdpwssd m6, m14, m4 ; a2
+ mova m4, m5
+ vpdpwssd m7, m14, m5 ; b2
+ movu ym5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m5, [srcq+strideq*0], 1
+ vpermb m5, m8, m5 ; 78
+ vpshrdd m4, m5, 16 ; 67
+ vpdpwssd m6, m15, m4 ; a3
+ vpdpwssd m7, m15, m5 ; b3
+ vpermt2b m6, m11, m7
+ mova [tmpq], m6
+ add tmpq, 64
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ PUSH r8
+ movaps [rsp+stack_offset+8], xmm6
+%endif
+ lea r5, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, tmpq
+.v_w32_loop0:
+ movu m16, [srcq+strideq*0]
+ movu m17, [srcq+strideq*1]
+ movu m18, [srcq+strideq*2]
+ add srcq, r6
+ movu m19, [srcq+strideq*0]
+ movu m20, [srcq+strideq*1]
+ movu m21, [srcq+strideq*2]
+ add srcq, r6
+ movu m22, [srcq+strideq*0]
+ mova m11, [prep_endC]
+ punpcklwd m0, m16, m17 ; 01l
+ punpckhwd m16, m17 ; 01h
+ punpcklwd m1, m17, m18 ; 12l
+ punpckhwd m17, m18 ; 12h
+ punpcklwd m2, m18, m19 ; 23l
+ punpckhwd m18, m19 ; 23h
+ punpcklwd m3, m19, m20 ; 34l
+ punpckhwd m19, m20 ; 34h
+ punpcklwd m4, m20, m21 ; 45l
+ punpckhwd m20, m21 ; 45h
+ punpcklwd m5, m21, m22 ; 56l
+ punpckhwd m21, m22 ; 56h
+.v_w32_loop:
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; a0l
+ mova m8, m10
+ vpdpwssd m8, m12, m16 ; a0h
+ mova m7, m10
+ vpdpwssd m7, m12, m1 ; b0l
+ mova m9, m10
+ vpdpwssd m9, m12, m17 ; b0h
+ mova m0, m2
+ vpdpwssd m6, m13, m2 ; a1l
+ mova m16, m18
+ vpdpwssd m8, m13, m18 ; a1h
+ mova m1, m3
+ vpdpwssd m7, m13, m3 ; b1l
+ mova m17, m19
+ vpdpwssd m9, m13, m19 ; b1h
+ mova m2, m4
+ vpdpwssd m6, m14, m4 ; a2l
+ mova m18, m20
+ vpdpwssd m8, m14, m20 ; a2h
+ mova m3, m5
+ vpdpwssd m7, m14, m5 ; b2l
+ mova m19, m21
+ vpdpwssd m9, m14, m21 ; b2h
+ movu m21, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklwd m4, m22, m21 ; 67l
+ punpckhwd m20, m22, m21 ; 67h
+ movu m22, [srcq+strideq*0]
+ vpdpwssd m6, m15, m4 ; a3l
+ vpdpwssd m8, m15, m20 ; a3h
+ punpcklwd m5, m21, m22 ; 78l
+ punpckhwd m21, m22 ; 78h
+ vpdpwssd m7, m15, m5 ; b3l
+ vpdpwssd m9, m15, m21 ; b3h
+ vpermt2b m6, m11, m8
+ vpermt2b m7, m11, m9
+ mova [tmpq+wq*0], m6
+ mova [tmpq+wq*2], m7
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, r5b
+ mov srcq, r7
+ mov tmpq, r8
+ sub r5d, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ movaps xmm6, [rsp+stack_offset+8]
+ POP r8
+%endif
+ vzeroupper
+ RET
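+; 2-D (hv) path: horizontally filtered rows are held in sliding registers
+; and combined with the vertical taps in a second vpdpwssd pass, two output
+; rows per iteration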
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mov r5d, r7m
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 2
+ shr r5d, 11
+ sub srcq, r6
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ psllw xmm1, 2
+ vpbroadcastd m10, [prep_8tap_rnd]
+ vpbroadcastd ym11, [pd_128]
+ mova xm21, [prep_endA]
+ mova [tmpq+ 0], xmm0
+ mova [tmpq+16], xmm1
+ vpbroadcastd m8, [tmpq+ 4]
+ vpbroadcastd m9, [tmpq+ 8]
+ vpbroadcastd ym12, xmm1
+ vpbroadcastd ym13, [tmpq+20]
+ vpbroadcastd ym14, [tmpq+24]
+ vpbroadcastd ym15, [tmpq+28]
+ movu xm4, [srcq+strideq*0]
+ vinserti32x4 ym4, [srcq+strideq*1], 1
+ vinserti32x4 m4, [srcq+strideq*2], 2
+ add srcq, r6
+ vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3
+ movu xm0, [srcq+strideq*1]
+ vinserti32x4 ym0, [srcq+strideq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
+ vbroadcasti32x4 m19, [spel_h_shufA]
+ vbroadcasti32x4 m20, [spel_h_shufB]
+ mova ym6, [spel_shuf4a]
+ mova ym7, [spel_shuf4b]
+ mova m2, m10
+ mova m3, m10
+ pshufb m1, m4, m19
+ vpdpwssd m2, m8, m1
+ pshufb m1, m0, m19
+ vpdpwssd m3, m8, m1
+ pshufb m4, m20
+ vpdpwssd m2, m9, m4
+ pshufb m0, m20
+ vpdpwssd m3, m9, m0
+ vpermb m1, m6, m2 ; 01 12
+ vshufi32x4 m2, m3, q1032
+ vpermb m3, m6, m3 ; 45 56
+ vpermb m2, m6, m2 ; 23 34
+.hv_w4_loop:
+ movu xm18, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 ym18, [srcq+strideq*0], 1
+ mova ym16, ym11
+ mova ym4, ym10
+ pshufb ym17, ym18, ym19
+ vpdpwssd ym16, ym12, ym1 ; a0 b0
+ vpdpwssd ym4, ym8, ym17
+ pshufb ym18, ym20
+ mova ym1, ym2
+ vpdpwssd ym16, ym13, ym2 ; a1 b1
+ vpdpwssd ym4, ym9, ym18 ; 7 8
+ mova ym2, ym3
+ vpdpwssd ym16, ym14, ym3 ; a2 b2
+ vpermt2b ym3, ym7, ym4 ; 67 78
+ vpdpwssd ym16, ym15, ym3 ; a3 b3
+ vpermb ym16, ym21, ym16
+ mova [tmpq], xm16
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ mov r5d, r7m
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 6
+ shr r5d, 11
+ sub srcq, r6
+ vpbroadcastd m10, [prep_8tap_rnd]
+ vpbroadcastd m11, [pd_128]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ psllw xmm1, 2
+ mova [tmpq+ 0], xmm0
+ mova [tmpq+16], xmm1
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ vpbroadcastd m16, xmm1
+ vpbroadcastd m17, [tmpq+20]
+ vpbroadcastd m18, [tmpq+24]
+ vpbroadcastd m19, [tmpq+28]
+ cmp wd, 16
+ je .hv_w16
+ jg .hv_w32
+ WIN64_SPILL_XMM 23
+ mova m5, [spel_h_shufA]
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1
+ movu ym9, [srcq+strideq*2]
+ add srcq, r6
+ vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3
+ movu ym20, [srcq+strideq*1]
+ vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5
+ add srcq, r6
+ movu ym21, [srcq+strideq*0] ; 6
+ movu m6, [spel_h_shufB]
+ movu m7, [spel_h_shufC]
+ mova ym22, [prep_endB]
+ vpermb m8, m5, m0
+ mova m1, m10
+ vpdpwssd m1, m12, m8 ; a0 b0
+ vpermb m8, m5, m9
+ mova m2, m10
+ vpdpwssd m2, m12, m8 ; c0 d0
+ vpermb m8, m5, m20
+ mova m3, m10
+ vpdpwssd m3, m12, m8 ; e0 f0
+ vpermb m8, m5, m21
+ mova m4, m10
+ vpdpwssd m4, m12, m8 ; g0
+ vpermb m8, m6, m0
+ vpdpwssd m1, m13, m8 ; a1 b1
+ vpermb m8, m6, m9
+ vpdpwssd m2, m13, m8 ; c1 d1
+ vpermb m8, m6, m20
+ vpdpwssd m3, m13, m8 ; e1 f1
+ vpermb m8, m6, m21
+ vpdpwssd m4, m13, m8 ; g1
+ vpermb m8, m7, m0
+ vpdpwssd m1, m14, m8 ; a2 b2
+ vpermb m8, m7, m9
+ vpdpwssd m2, m14, m8 ; c2 d2
+ vpermb m8, m7, m20
+ vpdpwssd m3, m14, m8 ; e2 f2
+ vpermb m8, m7, m21
+ vpdpwssd m4, m14, m8 ; g2
+ mova m8, [spel_h_shufD]
+ vpermb m0, m8, m0
+ vpdpwssd m1, m15, m0 ; a3 b3
+ mova m0, [spel_shuf8a]
+ vpermb m9, m8, m9
+ vpdpwssd m2, m15, m9 ; c3 d3
+ mova m9, [spel_shuf8b]
+ vpermb m20, m8, m20
+ vpdpwssd m3, m15, m20 ; e3 f3
+ vpermb m21, m8, m21
+ vpdpwssd m4, m15, m21 ; g3
+ vpermt2b m1, m0, m2 ; 01 12
+ vpermt2b m2, m0, m3 ; 23 34
+ vpermt2b m3, m0, m4 ; 45 56
+.hv_w8_loop:
+ movu ym0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*0], 1
+ mova m4, m10
+ mova m20, m11
+ vpermb m21, m5, m0
+ vpdpwssd m4, m12, m21 ; h0 i0
+ vpermb m21, m6, m0
+ vpdpwssd m20, m16, m1 ; A0 B0
+ vpdpwssd m4, m13, m21 ; h1 i1
+ vpermb m21, m7, m0
+ mova m1, m2
+ vpdpwssd m20, m17, m2 ; A1 B1
+ vpdpwssd m4, m14, m21 ; h2 i2
+ vpermb m21, m8, m0
+ mova m2, m3
+ vpdpwssd m20, m18, m3 ; A2 B2
+ vpdpwssd m4, m15, m21 ; h3 i3
+ vpermt2b m3, m9, m4 ; 67 78
+ vpdpwssd m20, m19, m3 ; A3 B3
+ vpermb m20, m22, m20
+ mova [tmpq], ym20
+ add tmpq, 32
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 27
+ vbroadcasti32x8 m5, [srcq+strideq*0+ 8]
+ vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0
+ vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0
+ movu ym6, [srcq+strideq*1+ 0]
+ movu ym7, [srcq+strideq*1+16]
+ vinserti32x8 m6, [srcq+strideq*2+ 0], 1
+ vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2
+ add srcq, r6
+ movu ym22, [srcq+strideq*0+ 0]
+ movu ym23, [srcq+strideq*0+16]
+ vinserti32x8 m22, [srcq+strideq*1+ 0], 1
+ vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4
+ movu ym24, [srcq+strideq*2+ 0]
+ movu ym25, [srcq+strideq*2+16]
+ add srcq, r6
+ vinserti32x8 m24, [srcq+strideq*0+ 0], 1
+ vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m9, [spel_shuf16]
+ mova m26, [prep_endB]
+ pshufb m0, m4, m20
+ mova m1, m10
+ vpdpwssd m1, m12, m0 ; a0
+ pshufb m0, m6, m20
+ mova m2, m10
+ vpdpwssd m2, m12, m0 ; b0
+ pshufb m0, m7, m20
+ mova m3, m10
+ vpdpwssd m3, m14, m0 ; c2
+ pshufb m0, m4, m21
+ vpdpwssd m1, m13, m0 ; a1
+ pshufb m0, m6, m21
+ vpdpwssd m2, m13, m0 ; b1
+ pshufb m0, m7, m21
+ vpdpwssd m3, m15, m0 ; c3
+ pshufb m0, m5, m20
+ vpdpwssd m1, m14, m0 ; a2
+ shufpd m6, m7, 0x55
+ pshufb m7, m6, m20
+ vpdpwssd m2, m14, m7 ; b2
+ vpdpwssd m3, m12, m7 ; c0
+ pshufb m5, m21
+ vpdpwssd m1, m15, m5 ; a3
+ pshufb m6, m21
+ vpdpwssd m2, m15, m6 ; b3
+ vpdpwssd m3, m13, m6 ; c1
+ pshufb m0, m22, m20
+ mova m4, m10
+ vpdpwssd m4, m12, m0 ; d0
+ pshufb m0, m23, m20
+ mova m5, m10
+ vpdpwssd m5, m14, m0 ; e2
+ pshufb m0, m24, m20
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; f0
+ pshufb m0, m25, m20
+ mova m7, m10
+ vpdpwssd m7, m14, m0 ; g2
+ pshufb m0, m22, m21
+ vpdpwssd m4, m13, m0 ; d1
+ pshufb m0, m23, m21
+ vpdpwssd m5, m15, m0 ; e3
+ pshufb m0, m24, m21
+ vpdpwssd m6, m13, m0 ; f1
+ pshufb m0, m25, m21
+ vpdpwssd m7, m15, m0 ; g3
+ shufpd m22, m23, 0x55
+ pshufb m23, m22, m20
+ vpdpwssd m4, m14, m23 ; d2
+ vpdpwssd m5, m12, m23 ; e0
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m6, m14, m25 ; f2
+ vpdpwssd m7, m12, m25 ; g0
+ pshufb m22, m21
+ vpdpwssd m4, m15, m22 ; d3
+ vpdpwssd m5, m13, m22 ; e1
+ pshufb m24, m21
+ vpdpwssd m6, m15, m24 ; f3
+ vpdpwssd m7, m13, m24 ; g1
+ pslldq m1, 1
+ vpermt2b m2, m9, m3 ; 12
+ vpermt2b m4, m9, m5 ; 34
+ vpermt2b m6, m9, m7 ; 56
+ vpshrdd m1, m2, 16 ; 01
+ vpshrdd m3, m2, m4, 16 ; 23
+ vpshrdd m5, m4, m6, 16 ; 45
+.hv_w16_loop:
+ movu ym24, [srcq+strideq*1+ 0]
+ movu ym25, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m24, [srcq+strideq*0+ 0], 1
+ vinserti32x8 m25, [srcq+strideq*0+16], 1
+ mova m7, m10
+ mova m8, m10
+ pshufb m0, m24, m20
+ vpdpwssd m7, m12, m0 ; h0
+ mova m22, m11
+ pshufb m0, m25, m20
+ vpdpwssd m8, m14, m0 ; i2
+ mova m23, m11
+ vpdpwssd m22, m16, m1 ; A0
+ mova m1, m3
+ vpdpwssd m23, m16, m2 ; B0
+ mova m2, m4
+ pshufb m0, m24, m21
+ vpdpwssd m7, m13, m0 ; h1
+ pshufb m0, m25, m21
+ vpdpwssd m8, m15, m0 ; i3
+ vpdpwssd m22, m17, m3 ; A1
+ mova m3, m5
+ vpdpwssd m23, m17, m4 ; B1
+ mova m4, m6
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m7, m14, m25 ; h2
+ vpdpwssd m8, m12, m25 ; i0
+ vpdpwssd m22, m18, m5 ; A2
+ vpdpwssd m23, m18, m6 ; B2
+ pshufb m24, m21
+ vpdpwssd m7, m15, m24 ; h3
+ vpdpwssd m8, m13, m24 ; i1
+ vpermt2b m7, m9, m8 ; 78
+ vpshrdd m5, m6, m7, 16 ; 67
+ vpdpwssd m22, m19, m5 ; A3
+ vpdpwssd m23, m19, m7 ; B3
+ mova m6, m7
+ vpermt2b m22, m26, m23
+ mova [tmpq], m22
+ add tmpq, 64
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ PUSH r8
+ %assign regs_used regs_used + 1
+ WIN64_SPILL_XMM 32
+%endif
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m22, [spel_shuf32]
+ lea r5d, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, tmpq
+.hv_w32_loop0:
+ movu m6, [srcq+strideq*0+ 0]
+ movu m7, [srcq+strideq*0+ 8]
+ movu m8, [srcq+strideq*0+16]
+ mova m0, m10
+ mova m23, m10
+ pshufb m9, m6, m20
+ vpdpwssd m0, m12, m9 ; a0l
+ pshufb m9, m7, m20
+ vpdpwssd m23, m12, m9 ; a0h
+ vpdpwssd m0, m14, m9 ; a2l
+ pshufb m7, m21
+ vpdpwssd m23, m13, m7 ; a1h
+ vpdpwssd m0, m15, m7 ; a3l
+ pshufb m7, m8, m20
+ vpdpwssd m23, m14, m7 ; a2h
+ pshufb m6, m21
+ vpdpwssd m0, m13, m6 ; a1l
+ pshufb m8, m21
+ vpdpwssd m23, m15, m8 ; a3h
+ PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12
+ PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34
+ PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23l
+ vpshrdd m25, m24, m26, 16 ; 23h
+ vpshrdd m4, m3, m5, 16 ; 45l
+ vpshrdd m27, m26, m28, 16 ; 45h
+.hv_w32_loop:
+ movu m7, [srcq+strideq*1+ 0]
+ movu m9, [srcq+strideq*2+ 0]
+ movu m6, [srcq+strideq*1+ 8]
+ movu m8, [srcq+strideq*2+ 8]
+ mova m29, m10
+ mova m31, m10
+ pshufb m30, m7, m20
+ vpdpwssd m29, m12, m30 ; h0l
+ pshufb m30, m9, m20
+ vpdpwssd m31, m12, m30 ; i0l
+ pshufb m7, m21
+ vpdpwssd m29, m13, m7 ; h1l
+ pshufb m9, m21
+ vpdpwssd m31, m13, m9 ; i1l
+ pshufb m7, m6, m20
+ vpdpwssd m29, m14, m7 ; h2l
+ pshufb m9, m8, m20
+ vpdpwssd m31, m14, m9 ; i2l
+ pshufb m6, m21
+ vpdpwssd m29, m15, m6 ; h3l
+ pshufb m8, m21
+ vpdpwssd m31, m15, m8 ; i3l
+ mova m30, m10
+ vpdpwssd m30, m12, m7 ; h0h
+ movu m7, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*2]
+ vpermt2b m29, m22, m31 ; 78l
+ mova m31, m10
+ vpdpwssd m31, m12, m9 ; i0h
+ movu m9, [srcq+strideq*0+16]
+ vpdpwssd m30, m13, m6 ; h1h
+ pshufb m6, m7, m20
+ vpdpwssd m31, m13, m8 ; i1h
+ pshufb m8, m9, m20
+ vpdpwssd m30, m14, m6 ; h2h
+ mova m6, m11
+ vpdpwssd m6, m16, m0 ; A0l
+ pshufb m7, m21
+ vpdpwssd m31, m14, m8 ; i2h
+ mova m8, m11
+ vpdpwssd m8, m16, m23 ; A0h
+ pshufb m9, m21
+ vpdpwssd m30, m15, m7 ; h3h
+ mova m7, m11
+ vpdpwssd m7, m16, m1 ; B0l
+ vpdpwssd m31, m15, m9 ; i3h
+ mova m9, m11
+ vpdpwssd m9, m16, m24 ; B0h
+ mova m0, m2
+ vpdpwssd m6, m17, m2 ; A1l
+ mova m23, m25
+ vpdpwssd m8, m17, m25 ; A1h
+ mova m1, m3
+ vpdpwssd m7, m17, m3 ; B1l
+ mova m24, m26
+ vpdpwssd m9, m17, m26 ; B1h
+ vpermt2b m30, m22, m31 ; 78h
+ mova m31, [prep_endC]
+ vpdpwssd m6, m18, m4 ; A2l
+ mova m2, m4
+ vpdpwssd m8, m18, m27 ; A2h
+ mova m25, m27
+ vpdpwssd m7, m18, m5 ; B2l
+ mova m3, m5
+ vpdpwssd m9, m18, m28 ; B2h
+ mova m26, m28
+ vpshrdd m4, m5, m29, 16 ; 67l
+ vpdpwssd m6, m19, m4 ; A3l
+ vpshrdd m27, m28, m30, 16 ; 67h
+ vpdpwssd m8, m19, m27 ; A3h
+ mova m5, m29
+ vpdpwssd m7, m19, m29 ; B3l
+ mova m28, m30
+ vpdpwssd m9, m19, m30 ; B3h
+ vpermt2b m6, m31, m8
+ vpermt2b m7, m31, m9
+ mova [tmpq+wq*0], m6
+ mova [tmpq+wq*2], m7
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, r5b
+ mov srcq, r7
+ mov tmpq, r8
+ sub r5d, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
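+; warp_affine_8x8(t): .main/.main2 run the horizontal pass (per-column
+; filters gathered from mc_warp_filter, mx advanced by alpha/beta) followed
+; by the vertical pass (my advanced by gamma/delta), two rows at a time;
+; the 8x8t variant keeps intermediate precision, 8x8 rounds to pixels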
+cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
+%define base r6-pd_0to7
+ mov t0d, r7m
+ lea r6, [pd_0to7]
+ shr t0d, 11
+ vpbroadcastd m8, [base+warp_8x8t_rnd_v]
+ vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
+ psrad m14, m16, 15
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
+ psrad m16, 15
+ packssdw m14, m16
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
+ psrad m15, m16, 15
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
+ add tsq, tsq
+ psrad m16, 15
+ packssdw m15, m16
+ jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end
+
+cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
+ mov t0d, r7m ; pixel_max
+ lea r6, [pd_0to7]
+ shr t0d, 11
+ vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
+ vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4]
+ call .main
+ psrad m14, m16, 13
+ call .main2
+ psrad m16, 13
+ packusdw m14, m16
+ call .main2
+ psrad m15, m16, 13
+ call .main2
+ vpbroadcastd m0, [base+bidir_shift+t0*4]
+ vpsrlvw m14, m0
+ psrad m16, 13
+ packusdw m15, m16
+ vpsrlvw m15, m0
+.end:
+ mova m0, [base+warp8x8_end]
+ vpermb m16, m0, m14
+ lea r2, [dsq*3]
+ mova [dstq+dsq*0], xm16
+ vextracti128 [dstq+dsq*1], ym16, 1
+ vextracti32x4 [dstq+dsq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ vpermb m16, m0, m15
+ lea dstq, [dstq+dsq*4]
+ mova [dstq+dsq*0], xm16
+ vextracti128 [dstq+dsq*1], ym16, 1
+ vextracti32x4 [dstq+dsq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ RET
+.main:
+ vpbroadcastd ym3, [base+pd_512]
+%if WIN64
+ mov abcdq, r5mp
+ vpaddd ym18, ym3, r6m {1to8} ; mx
+%else
+ add r5d, 512
+ vpbroadcastd ym18, r5d
+%endif
+ vpaddd ym20, ym3, r7m {1to8} ; my
+ mova ym16, [base+pd_0to7]
+ vpbroadcastd ym19, [abcdq+4*0] ; alpha
+ vpbroadcastd ym21, [abcdq+4*1] ; gamma
+ lea r4, [ssq*3+6]
+ vpdpwssd ym18, ym19, ym16 ; tmx
+ vpdpwssd ym20, ym21, ym16 ; tmy
+ sub srcq, r4
+ mova m10, [base+warp8x8_permA]
+ lea r4, [mc_warp_filter+64*8]
+ vbroadcasti32x4 m12, [base+warp8x8_permC]
+ kxnorb k1, k1, k1
+ vbroadcasti32x4 m13, [base+warp8x8_permD]
+ movu ym5, [srcq+0]
+ vinserti32x8 m5, [srcq+8], 1
+ psrad ym17, ym18, 10
+ mova m11, [base+warp8x8_permB]
+ kmovb k2, k1
+ vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0
+ psrad ym19, 16 ; beta
+ psrad ym21, 16 ; delta
+ paddd ym18, ym19
+ vpermb m4, m10, m5
+ vpbroadcastq m9, [base+warp_shift_h+t0*8]
+ pshufd m3, m3, q3120
+ paddd m7, m1, m1
+ pshufb m2, m3, m12
+ vpdpwssd m1, m4, m2
+ vpermb m5, m11, m5
+ vshufi32x4 m4, m5, q1021
+ pshufb m3, m13
+ vpdpwssd m1, m4, m3
+ call .h
+ psllq m2, m1, 32
+ paddd m1, m2
+ vpmultishiftqb m1, m9, m1
+ vpshrdq m1, m0, 48 ; 01 12
+ call .h
+ vpshrdq m2, m1, m0, 48 ; 23 34
+ call .h
+ vpshrdq m3, m2, m0, 48 ; 45 56
+.main2:
+ call .h
+ psrad ym6, ym20, 10
+ kmovb k1, k2
+ paddd ym17, ym20, ym21 ; my += delta
+ vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0
+ psrad ym16, ym17, 10
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1
+ shufps m5, m20, m6, q2020
+ mova m16, m8
+ pshufb m4, m5, m12
+ vpdpwssd m16, m1, m4 ; a0 b0
+ pshufb m5, m13
+ mova m1, m2
+ vpdpwssd m16, m2, m5 ; a1 b1
+ shufps m6, m20, m6, q3131
+ paddd ym20, ym17, ym21
+ pshufb m4, m6, m12
+ mova m2, m3
+ vpdpwssd m16, m3, m4 ; a2 b2
+ vpshrdq m3, m0, 48 ; 67 78
+ pshufb m6, m13
+ vpdpwssd m16, m3, m6 ; a3 b3
+ ret
+ALIGN function_align
+.h:
+ movu ym16, [srcq+ssq*1]
+ psrad ym6, ym18, 10
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m5, m16, [srcq+ssq*0], 1
+ kmovb k1, k2
+ paddd ym17, ym18, ym19 ; mx += beta
+ vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1
+ psrad ym16, ym17, 10
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2
+ vpermb m4, m10, m5
+ shufps m16, m18, m6, q2020
+ shufps m6, m18, m6, q3131
+ mova m0, m7
+ pshufb m18, m16, m12
+ vpdpwssd m0, m4, m18 ; a0 b0
+ vpermb m5, m11, m5
+ pshufb m18, m6, m13
+ vpdpwssd m0, m5, m18 ; a3 b3
+ paddd ym18, ym17, ym19
+ vshufi32x4 m17, m4, m5, q1021
+ pshufb m16, m13
+ vpdpwssd m0, m17, m16 ; a1 b1
+ vshufi32x4 m4, m5, q2132
+ pshufb m6, m12
+ vpdpwssd m0, m4, m6 ; a2 b2
+ vpmultishiftqb m0, m9, m0 ; a a b b
+ ret
+
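+; BIDIR_FN: store loops shared by the bidirectional compound functions
+; (avg, w_avg, mask); each call to .main leaves two registers of output
+; pixels in m0/m1 which are written out according to the block width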
+%macro BIDIR_FN 0
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm0, ym1, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ vextracti32x4 xm0, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+ mova [dstq+64*2], m0
+ mova [dstq+64*3], m1
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
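+; avg: (tmp1 + tmp2) with the prep bias removed via a saturating max/sub
+; pair, then shifted back to pixel range; the round/shift constants are
+; selected by bitdepth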
+cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx512icl_table
+ lea r6, [avg_avx512icl_table]
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ vpbroadcastd m2, [base+avg_round+t0*4]
+ vpbroadcastd m3, [base+avg_shift+t0*4]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m0, [tmp1q+64*0]
+ paddsw m0, [tmp2q+64*0]
+ mova m1, [tmp1q+64*1]
+ paddsw m1, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ pmaxsw m0, m2
+ pmaxsw m1, m2
+ psubsw m0, m2
+ psubsw m1, m2
+ vpsrlvw m0, m3
+ vpsrlvw m1, m3
+ ret
+
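+; w_avg: weighted average tmp1*weight + tmp2*(16-weight), evaluated with
+; vpdpwssd on interleaved word pairs, then shifted and packed back to
+; pixel range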
+cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg_avx512icl_table
+ lea r6, [w_avg_avx512icl_table]
+ tzcnt wd, wm
+ mov t0d, r7m ; pixel_max
+ shr t0d, 11
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m5, [base+w_avg_round+t0*4]
+ vpbroadcastd m7, [base+bidir_shift+t0*4]
+ add wq, r6
+ mov r6d, r6m ; weight
+ lea t0d, [r6-16]
+ shl r6d, 16
+ sub r6d, t0d ; 16-weight, weight
+ movifnidn hd, hm
+ vpbroadcastd m6, r6d
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m3, [tmp1q+64*0]
+ mova m1, [tmp2q+64*0]
+ mova m0, [tmp1q+64*1]
+ mova m4, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ punpcklwd m2, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ mova m0, m5
+ vpdpwssd m0, m6, m2
+ mova m2, m5
+ vpdpwssd m2, m6, m1
+ mova m1, m5
+ vpdpwssd m1, m6, m3
+ mova m3, m5
+ vpdpwssd m3, m6, m4
+ REPX {psrad x, 2}, m0, m2, m1, m3
+ packusdw m0, m2
+ packusdw m1, m3
+ vpsrlvw m0, m7
+ vpsrlvw m1, m7
+ ret
+
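+; mask: per-pixel blend of the two prep buffers with the 6-bit mask from
+; maskq: tmp1*m + tmp2*(64-m), again via word-pair dot products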
+cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx512icl_table
+ lea r7, [mask_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_64]
+ vpbroadcastd m9, [base+mask_round+r6*4]
+ vpbroadcastd m10, [base+bidir_shift+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ BIDIR_FN
+ALIGN function_align
+.main:
+ pmovzxbw m1, [maskq+32*0]
+ mova m4, [tmp1q+64*0]
+ mova m2, [tmp2q+64*0]
+ pmovzxbw m6, [maskq+32*1]
+ mova m5, [tmp1q+64*1]
+ mova m3, [tmp2q+64*1]
+ add maskq, 32*2
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ punpcklwd m7, m4, m2
+ punpckhwd m4, m2
+ psubw m0, m8, m1
+ punpcklwd m2, m1, m0 ; m, 64-m
+ punpckhwd m1, m0
+ mova m0, m9
+ vpdpwssd m0, m7, m2
+ mova m2, m9
+ vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
+ punpcklwd m7, m5, m3
+ punpckhwd m5, m3
+ psubw m1, m8, m6
+ punpcklwd m3, m6, m1
+ punpckhwd m6, m1
+ mova m1, m9
+ vpdpwssd m1, m7, m3
+ mova m3, m9
+ vpdpwssd m3, m5, m6
+ REPX {psrad x, 4}, m0, m2, m1, m3
+ packusdw m0, m2
+ packusdw m1, m3
+ vpsrlvw m0, m10
+ vpsrlvw m1, m10
+ ret
+
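+; w_mask_420: derives the blend weight from |tmp1 - tmp2|, blends the two
+; prep buffers with it, and writes the weights summed over 2x2 blocks as
+; the 4:2:0 chroma mask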
+cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+ lea r7, [w_mask_420_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+mask_round+r6*4]
+ vpbroadcastd m13, [base+bidir_shift+r6*4]
+ mov r6d, r7m ; sign
+ vpbroadcastd m14, [base+w_mask_round+r6*4]
+ mova ym15, [w_mask_end42x]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova m4, [w_mask_shuf4]
+ vpermt2b m2, m4, m3
+ mova m3, m14
+ vpdpbusd m3, m2, [pb_64] {1to16}
+ vpermb m3, m15, m3
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ mova [maskq], xm3
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8:
+ mova m8, [w_mask_shuf8]
+ vpbroadcastd m9, [pb_64]
+ jmp .w8_start
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8_start:
+ vpermt2b m2, m8, m3
+ mova m3, m14
+ vpdpbusd m3, m2, m9
+ vpermb m3, m15, m3
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ mova [maskq], xm3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16:
+ mova m8, [w_mask_shuf16]
+ vpbroadcastd m9, [pb_64]
+ jmp .w16_start
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16_start:
+ vpermt2b m2, m8, m3
+ mova m3, m14
+ vpdpbusd m3, m2, m9
+ vpermb m3, m15, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ mova [maskq], xm3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w32:
+ paddw m2, m3
+ mova m8, m14
+ vpdpwssd m8, m11, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ call .main
+ paddw m2, m3
+ mova m3, m14
+ vpdpwssd m3, m11, m2
+ vpermt2b m8, m15, m3
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ mova [maskq], ym8
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w64:
+ mova m8, m2
+ mova m9, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ call .main
+ paddw m8, m2
+ paddw m9, m3
+ mova m2, m14
+ vpdpwssd m2, m11, m8
+ mova m3, m14
+ vpdpwssd m3, m11, m9
+ vpermt2b m2, m15, m3
+ mova [dstq+strideq*1+64*0], m0
+ mova [dstq+strideq*1+64*1], m1
+ mova [maskq], ym2
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 64
+.w128:
+ mova m16, m2
+ mova m8, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ call .main
+ mova m17, m2
+ mova m9, m3
+ mova [dstq+strideq*0+64*2], m0
+ mova [dstq+strideq*0+64*3], m1
+ call .main
+ paddw m2, m16
+ paddw m3, m8
+ mova m16, m14
+ vpdpwssd m16, m11, m2
+ mova m8, m14
+ vpdpwssd m8, m11, m3
+ mova [dstq+strideq*1+64*0], m0
+ mova [dstq+strideq*1+64*1], m1
+ call .main
+ paddw m2, m17
+ paddw m3, m9
+ mova m17, m14
+ vpdpwssd m17, m11, m2
+ mova m9, m14
+ vpdpwssd m9, m11, m3
+ vpermt2b m16, m15, m8
+ vpermt2b m17, m15, m9
+ mova [dstq+strideq*1+64*2], m0
+ mova [dstq+strideq*1+64*3], m1
+ mova [maskq+32*0], ym16
+ mova [maskq+32*1], ym17
+ sub hd, 2
+ jg .w128_loop
+ vzeroupper
+ RET
+ALIGN function_align
+.main:
+ mova m1, [tmp1q+64*0]
+ mova m3, [tmp2q+64*0]
+ mova m4, [tmp1q+64*1]
+ mova m7, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ psubsw m6, m1, m3
+ punpcklwd m5, m3, m1
+ pabsw m6, m6
+ punpckhwd m3, m1
+ psubusw m6, m10, m6
+ psrlw m6, 10 ; 64-m
+ psubw m2, m11, m6 ; m
+ punpcklwd m1, m6, m2
+ punpckhwd m6, m2
+ mova m0, m12
+ vpdpwssd m0, m5, m1
+ mova m1, m12
+ vpdpwssd m1, m3, m6
+ psubsw m5, m4, m7
+ punpcklwd m6, m7, m4
+ pabsw m5, m5
+ punpckhwd m7, m4
+ psubusw m5, m10, m5
+ psrlw m5, 10
+ psubw m3, m11, m5
+ punpcklwd m4, m5, m3
+ psrad m0, 4
+ punpckhwd m5, m3
+ psrad m1, 4
+ packusdw m0, m1
+ mova m1, m12
+ vpdpwssd m1, m6, m4
+ mova m4, m12
+ vpdpwssd m4, m7, m5
+ psrad m1, 4
+ psrad m4, 4
+ packusdw m1, m4
+ vpsrlvw m0, m13
+ vpsrlvw m1, m13
+ ret
+
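+; w_mask_422: same weight derivation, but the mask is only subsampled
+; horizontally (pairs of weights combined) for 4:2:2 chroma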
+cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+ lea r7, [w_mask_422_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m9, [base+pw_64]
+ vpbroadcastd m10, [base+mask_round+r6*4]
+ vpbroadcastd m11, [base+bidir_shift+r6*4]
+ mov r6d, r7m ; sign
+ vpbroadcastd m12, [base+w_mask_round+r6*4]
+ mova ym13, [w_mask_end42x]
+ mov maskq, maskmp
+ add wq, r7
+ paddw m14, m9, m9 ; pw_128
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+ mova [dstq+64*2], m0
+ mova [dstq+64*3], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m1, [tmp1q+64*0]
+ mova m3, [tmp2q+64*0]
+ mova m4, [tmp1q+64*1]
+ mova m7, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ psubsw m6, m1, m3
+ punpcklwd m5, m3, m1
+ pabsw m6, m6
+ punpckhwd m3, m1
+ psubusw m6, m8, m6
+ psrlw m6, 10
+ psubw m2, m9, m6
+ punpcklwd m1, m6, m2
+ punpckhwd m6, m2
+ mova m0, m10
+ vpdpwssd m0, m5, m1
+ mova m1, m10
+ vpdpwssd m1, m3, m6
+ psubsw m5, m4, m7
+ punpcklwd m6, m7, m4
+ pabsw m5, m5
+ punpckhwd m7, m4
+ psubusw m5, m8, m5
+ psrlw m5, 10
+ psubw m3, m9, m5
+ punpcklwd m4, m5, m3
+ psrad m0, 4
+ punpckhwd m5, m3
+ psrad m1, 4
+ packusdw m0, m1
+ mova m1, m10
+ vpdpwssd m1, m6, m4
+ mova m4, m10
+ vpdpwssd m4, m7, m5
+ mova m5, m12
+ vpdpwssd m5, m14, m2
+ mova m2, m12
+ vpdpwssd m2, m14, m3
+ psrad m1, 4
+ psrad m4, 4
+ packusdw m1, m4
+ vpermt2b m5, m13, m2
+ vpsrlvw m0, m11
+ vpsrlvw m1, m11
+ mova [maskq], ym5
+ add maskq, 32
+ ret
+
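+; w_mask_444: no subsampling; the full-resolution weights are stored
+; directly as the mask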
+cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx512icl_table
+ lea r7, [w_mask_444_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m9, [base+pw_64]
+ vpbroadcastd m10, [base+mask_round+r6*4]
+ mova m11, [w_mask_end444]
+ vpbroadcastd m12, [base+bidir_shift+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+ mova [dstq+64*2], m0
+ mova [dstq+64*3], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m1, [tmp1q+64*0]
+ mova m3, [tmp2q+64*0]
+ mova m4, [tmp1q+64*1]
+ mova m7, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ psubsw m6, m1, m3
+ punpcklwd m5, m3, m1
+ pabsw m6, m6
+ punpckhwd m3, m1
+ psubusw m6, m8, m6
+ psrlw m6, 10
+ psubw m2, m9, m6
+ punpcklwd m1, m6, m2
+ punpckhwd m6, m2
+ mova m0, m10
+ vpdpwssd m0, m5, m1
+ mova m1, m10
+ vpdpwssd m1, m3, m6
+ psubsw m5, m4, m7
+ punpcklwd m6, m7, m4
+ pabsw m5, m5
+ punpckhwd m7, m4
+ psubusw m5, m8, m5
+ psrlw m5, 10
+ psubw m3, m9, m5
+ punpcklwd m4, m5, m3
+ psrad m0, 4
+ punpckhwd m5, m3
+ psrad m1, 4
+ packusdw m0, m1
+ mova m1, m10
+ vpdpwssd m1, m6, m4
+ mova m4, m10
+ vpdpwssd m4, m7, m5
+ vpermt2b m2, m11, m3
+ psrad m1, 4
+ psrad m4, 4
+ packusdw m1, m4
+ vpsrlvw m0, m12
+ vpsrlvw m1, m12
+ mova [maskq], m2
+ add maskq, 64
+ ret
+
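+; blend: dst += ((tmp - dst) * mask) >> 6 with rounding, the negation and
+; shift being folded into pmulhrsw via the pw_m512 mask premultiply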
+cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx512icl_table
+ lea r6, [blend_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ vpbroadcastd m6, [base+pw_m512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ pmovzxbw ym19, [maskq]
+ movq xm16, [dstq+dsq*0]
+ movhps xm16, [dstq+dsq*1]
+ vpbroadcastq ym17, [dstq+dsq*2]
+ vpbroadcastq ym18, [dstq+r6 ]
+ pmullw ym19, ym6
+ vpblendd ym16, ym17, 0x30
+ vpblendd ym16, ym18, 0xc0
+ psubw ym17, ym16, [tmpq]
+ add maskq, 16
+ add tmpq, 32
+ pmulhrsw ym17, ym19
+ paddw ym16, ym17
+ vextracti128 xm17, ym16, 1
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ movq [dstq+dsq*2], xm17
+ movhps [dstq+r6 ], xm17
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ vzeroupper
+ RET
+.w8:
+ pmovzxbw m2, [maskq]
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+ vinserti32x4 m0, [dstq+dsq*2], 2
+ vinserti32x4 m0, [dstq+r6 ], 3
+ pmullw m2, m6
+ psubw m1, m0, [tmpq]
+ add maskq, 32
+ add tmpq, 64
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ vextracti32x4 [dstq+dsq*2], m0, 2
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pmovzxbw m4, [maskq+32*0]
+ pmovzxbw m5, [maskq+32*1]
+ mova ym0, [dstq+dsq*0]
+ vinserti32x8 m0, [dstq+dsq*1], 1
+ mova ym1, [dstq+dsq*2]
+ vinserti32x8 m1, [dstq+r6 ], 1
+ pmullw m4, m6
+ pmullw m5, m6
+ psubw m2, m0, [tmpq+64*0]
+ psubw m3, m1, [tmpq+64*1]
+ add maskq, 32*2
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ mova [dstq+dsq*2], ym1
+ vextracti32x8 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ pmovzxbw m4, [maskq+32*0]
+ pmovzxbw m5, [maskq+32*1]
+ mova m0, [dstq+dsq*0]
+ mova m1, [dstq+dsq*1]
+ pmullw m4, m6
+ pmullw m5, m6
+ psubw m2, m0, [tmpq+ 64*0]
+ psubw m3, m1, [tmpq+ 64*1]
+ add maskq, 32*2
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32
+ RET
+
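+; blend_v: the blend weight depends only on the column, so the obmc mask
+; row is loaded once per width outside the row loop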
+cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
+ lea r5, [blend_v_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ vpbroadcastd xmm2, [obmc_masks_avx2+2*2]
+.w2_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ psubw xmm1, xmm0, xmm1
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ vpbroadcastq xmm2, [obmc_masks_avx2+4*2]
+.w4_loop:
+ movq xmm0, [dstq+dsq*0]
+ movhps xmm0, [dstq+dsq*1]
+ psubw xmm1, xmm0, [tmpq]
+ add tmpq, 8*2
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2]
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+ psubw ym1, ym0, [tmpq]
+ add tmpq, 16*2
+ pmulhrsw ym1, ym2
+ paddw ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m2, [obmc_masks_avx2+16*2]
+.w16_loop:
+ mova ym0, [dstq+dsq*0]
+ vinserti32x8 m0, [dstq+dsq*1], 1
+ psubw m1, m0, [tmpq]
+ add tmpq, 32*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ mova m4, [obmc_masks_avx2+32*2]
+.w32_loop:
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 64*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 64*1]
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+
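+; blend_h: the blend weight depends only on the row; only the top 3/4 of
+; the block is blended, with the obmc mask indexed by a negative row
+; counter running up to zero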
+cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
+%define base r6-$$
+ lea r6, [$$]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [base+blend_h_avx512icl_table+wq*4]
+ lea maskq, [base+obmc_masks_avx2+hq*2]
+ lea hd, [hq*3]
+ lea wq, [base+blend_h_avx512icl_table+wq]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ punpcklwd xmm2, xmm2
+ psubw xmm1, xmm0, xmm1
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova xmm3, [blend_shuf]
+.w4_loop:
+ movq xmm0, [dstq+dsq*0]
+ movhps xmm0, [dstq+dsq*1]
+ movd xmm2, [maskq+hq*2]
+ psubw xmm1, xmm0, [tmpq]
+ add tmpq, 8*2
+ pshufb xmm2, xmm3
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym3, [blend_shuf]
+ shufpd ym3, ym3, 0x0c
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+ vpbroadcastd ym2, [maskq+hq*2]
+ psubw ym1, ym0, [tmpq]
+ add tmpq, 16*2
+ pshufb ym2, ym3
+ pmulhrsw ym1, ym2
+ paddw ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 m3, [blend_shuf]
+ shufpd m3, m3, 0xf0
+.w16_loop:
+ mova ym0, [dstq+dsq*0]
+ vinserti32x8 m0, [dstq+dsq*1], 1
+ vpbroadcastd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 32*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+.w32:
+ vpbroadcastw m4, [maskq+hq*2]
+ vpbroadcastw m5, [maskq+hq*2+2]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 64*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 64*1]
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w32
+ RET
+.w64:
+ vpbroadcastw m4, [maskq+hq*2]
+ mova m0, [dstq+64*0]
+ psubw m2, m0, [tmpq+64*0]
+ mova m1, [dstq+64*1]
+ psubw m3, m1, [tmpq+64*1]
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m8, [maskq+hq*2]
+ mova m0, [dstq+64*0]
+ psubw m4, m0, [tmpq+64*0]
+ mova m1, [dstq+64*1]
+ psubw m5, m1, [tmpq+64*1]
+ mova m2, [dstq+64*2]
+ psubw m6, m2, [tmpq+64*2]
+ mova m3, [dstq+64*3]
+ psubw m7, m3, [tmpq+64*3]
+ add tmpq, 64*4
+ REPX {pmulhrsw x, m8}, m4, m5, m6, m7
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ mova [dstq+64*2], m2
+ mova [dstq+64*3], m3
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
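+; resize: horizontal scaling with an 8-tap filter; the source position and
+; one of 64 filter phases are derived per output pixel from mx0/dx in
+; fixed point, with gathers handling the loads and edge clamping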
+cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ mov r6, ~0
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ kmovq k6, r6
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pd_16384]
+ vpbroadcastd m7, [base+pd_63]
+ mova m24, [base+resize_permA]
+ mova m25, [base+resize_permB]
+ mova m26, [base+resize_permC]
+ mova m27, [base+resize_permD]
+ vbroadcasti32x4 m28, [base+resize_shufA]
+ vbroadcasti32x4 m29, [base+resize_shufB]
+ mova m30, [base+resize_permE]
+ vpbroadcastw ym31, pxmaxm
+ vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
+ pslld m5, 4 ; dx*16
+ pslld m6, 14
+ pxor m2, m2
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ vptestmd k5, m1, m1
+ pand m9, m7 ; filter offset (masked)
+ ktestw k5, k5
+ jz .load
+ vpbroadcastq m14, [base+pd_0_4]
+ vpermq m10, m0, q1100
+ vpermq m11, m0, q3322
+ vpermq m20, m1, q1100
+ vpermq m21, m1, q3322
+ punpckldq m10, m10
+ punpckldq m11, m11
+ punpckldq m20, m20
+ punpckldq m21, m21
+ paddd m10, m14
+ paddd m11, m14
+ paddd m20, m14
+ paddd m21, m14
+ vextracti32x8 ym12, m10, 1
+ vextracti32x8 ym13, m11, 1
+ vextracti32x8 ym22, m20, 1
+ vextracti32x8 ym23, m21, 1
+ kmovq k1, k6
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3
+ vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7
+ vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B
+ vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F
+ kmovq k1, k6
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2]
+ vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2]
+ vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2]
+ vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2]
+ pshufb m16, m0
+ pshufb m17, m1
+ pshufb m18, m14
+ pshufb m19, m15
+ mova m20, m24
+ mova m22, m24
+ mova m21, m25
+ mova m23, m25
+ vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
+ vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
+ vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
+ vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
+ mova m15, m26
+ mova m17, m26
+ mova m16, m27
+ mova m18, m27
+ vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
+ vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
+ vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
+ vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
+ kmovq k1, k6
+ kmovq k2, k6
+ vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
+ pshufb m10, m11, m28
+ pshufb m11, m11, m29
+ pshufb m12, m13, m28
+ pshufb m13, m13, m29
+ jmp .filter
+.load:
+ kmovq k1, k6
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
+ pshufb m10, m11, m28
+ pshufb m11, m11, m29
+ pshufb m12, m13, m28
+ pshufb m13, m13, m29
+ vpgatherdd m15{k3}, [srcq+m0*2+ 0]
+ vpgatherdd m16{k4}, [srcq+m0*2+ 4]
+ kmovq k1, k6
+ kmovq k2, k6
+ vpgatherdd m17{k1}, [srcq+m0*2+ 8]
+ vpgatherdd m18{k2}, [srcq+m0*2+12]
+.filter:
+ mova m14, m2
+ vpdpwssd m14, m15, m10
+ vpdpwssd m14, m16, m11
+ vpdpwssd m14, m17, m12
+ vpdpwssd m14, m18, m13
+ psubd m14, m3, m14
+ psrad m14, 15
+ packusdw m14, m14
+ vpermq m14, m30, m14
+ pminsw ym14, ym31
+ mova [dstq+xq*2], ym14
+ paddd m4, m5
+ add xd, 16
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm
new file mode 100644
index 0000000000..fde8e372a3
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
@@ -0,0 +1,8731 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+; dav1d_obmc_masks[] << 9
+obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0
+ dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0
+ dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120
+ dw 4096, 3072, 2048, 1536, 0, 0, 0, 0
+ dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
+ dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608
+ dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024
+
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+rescale_mul: dd 0, 1, 2, 3
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+bdct_lb_q: times 8 db 0
+ times 8 db 4
+ times 8 db 8
+ times 8 db 12
+
+pw_2: times 8 dw 2
+pw_16: times 4 dw 16
+prep_mul: times 4 dw 16
+ times 8 dw 4
+pw_64: times 8 dw 64
+pw_256: times 8 dw 256
+pw_2048: times 4 dw 2048
+bidir_mul: times 4 dw 2048
+pw_8192: times 8 dw 8192
+pw_27615: times 8 dw 27615
+pw_32766: times 8 dw 32766
+pw_m512: times 8 dw -512
+pd_63: times 4 dd 63
+pd_64: times 4 dd 64
+pd_512: times 4 dd 512
+pd_m524256: times 4 dd -524256 ; (-8192 << 6) + 32
+pd_0x3ff: times 4 dd 0x3ff
+pd_0x4000: times 4 dd 0x4000
+pq_0x400000: times 2 dq 0x400000
+pq_0x40000000: times 2 dq 0x40000000
+pd_65538: times 2 dd 65538
+
+put_bilin_h_rnd: times 4 dw 8
+ times 4 dw 10
+s_8tap_h_rnd: times 2 dd 2
+ times 2 dd 8
+put_s_8tap_v_rnd: times 2 dd 512
+ times 2 dd 128
+s_8tap_h_sh: dd 2, 4
+put_s_8tap_v_sh: dd 10, 8
+bidir_rnd: times 4 dw -16400
+ times 4 dw -16388
+put_8tap_h_rnd: dd 34, 34, 40, 40
+prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
+prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)
+
+warp8x8_shift: dd 11, 13
+warp8x8_rnd1: dd 1024, 1024, 4096, 4096
+warp8x8_rnd2: times 4 dw 4096
+ times 4 dw 16384
+warp8x8t_rnd: times 2 dd 16384 - (8192 << 15)
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)
+
+BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter
+cextern resize_filter
+
+SECTION .text
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
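+; put_bilin: mx/my equal to zero select a plain copy; otherwise horizontal
+; and/or vertical bilinear filtering with 4-bit subpel weights (w, 16-w)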
+INIT_XMM ssse3
+cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
+%define base t0-put_ssse3
+ mov mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ movifnidn wd, wm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ tzcnt wd, wd
+ movzx wd, word [base+put_ssse3_table+wq*2]
+ add wq, t0
+ movifnidn hd, hm
+ jmp wq
+.put_w2:
+ mov r4d, [srcq+ssq*0]
+ mov r6d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4d
+ mov [dstq+dsq*1], r6d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0+16*0]
+ movu m1, [srcq+ssq*0+16*1]
+ movu m2, [srcq+ssq*1+16*0]
+ movu m3, [srcq+ssq*1+16*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+16*0], m0
+ mova [dstq+dsq*0+16*1], m1
+ mova [dstq+dsq*1+16*0], m2
+ mova [dstq+dsq*1+16*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, ssq
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, ssq
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ add srcq, 16*8
+ add dstq, 16*8
+.put_w128_loop:
+ movu m0, [srcq-16*8]
+ movu m1, [srcq-16*7]
+ movu m2, [srcq-16*6]
+ movu m3, [srcq-16*5]
+ mova [dstq-16*8], m0
+ mova [dstq-16*7], m1
+ mova [dstq-16*6], m2
+ mova [dstq-16*5], m3
+ movu m0, [srcq-16*4]
+ movu m1, [srcq-16*3]
+ movu m2, [srcq-16*2]
+ movu m3, [srcq-16*1]
+ mova [dstq-16*4], m0
+ mova [dstq-16*3], m1
+ mova [dstq-16*2], m2
+ mova [dstq-16*1], m3
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, ssq
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128_loop
+ RET
+.h:
+ movd m5, mxyd
+ mov mxyd, r7m ; my
+ mova m4, [base+pw_16]
+ pshufb m5, [base+pw_256]
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
+ mov r6d, r8m ; bitdepth_max
+ shr r6d, 11
+ movddup m3, [base+put_bilin_h_rnd+r6*8]
+ movifnidn hd, hm
+ sub wd, 8
+ jg .h_w16
+ je .h_w8
+ cmp wd, -4
+ je .h_w4
+.h_w2:
+ movq m1, [srcq+ssq*0]
+ movhps m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4, m1
+ psrlq m1, 16
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ movd [dstq+dsq*0], m0
+ punpckhqdq m0, m0
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq m0, [srcq+ssq*0]
+ movhps m0, [srcq+ssq*1]
+ movq m1, [srcq+ssq*0+2]
+ movhps m1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ neg wq
+.h_w16_loop0:
+ mov r6, wq
+.h_w16_loop:
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ movu m1, [srcq+r6*2+16]
+ movu m2, [srcq+r6*2+18]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+r6*2+16*0], m0
+ mova [dstq+r6*2+16*1], m1
+ add r6, 16
+ jl .h_w16_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ shl mxyd, 11
+ movd m5, mxyd
+ pshufb m5, [base+pw_256]
+ movifnidn hd, hm
+ cmp wd, 4
+ jg .v_w8
+ je .v_w4
+.v_w2:
+ movd m0, [srcq+ssq*0]
+.v_w2_loop:
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq m2, m0, m1
+ movd m0, [srcq+ssq*0]
+ punpcklqdq m1, m0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ movd [dstq+dsq*0], m1
+ punpckhqdq m1, m1
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq m0, [srcq+ssq*0]
+.v_w4_loop:
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq m2, m0, m1
+ movq m0, [srcq+ssq*0]
+ punpcklqdq m1, m0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+%if ARCH_X86_64
+%if WIN64
+ push r7
+%endif
+ shl wd, 5
+ mov r7, srcq
+ lea r6d, [wq+hq-256]
+ mov r4, dstq
+%else
+ mov r6, srcq
+%endif
+.v_w8_loop0:
+ movu m0, [srcq+ssq*0]
+.v_w8_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m5
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m5
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+%if ARCH_X86_64
+ add r7, 16
+ add r4, 16
+ movzx hd, r6b
+ mov srcq, r7
+ mov dstq, r4
+ sub r6d, 1<<8
+%else
+ mov dstq, dstmp
+ add r6, 16
+ mov hd, hm
+ add dstq, 16
+ mov srcq, r6
+ mov dstmp, dstq
+ sub wd, 8
+%endif
+ jg .v_w8_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11
+ mova m3, [base+pw_2]
+ movd m6, mxyd
+ mova m7, [base+pw_8192]
+ pshufb m6, [base+pw_256]
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ mova m7, [base+pw_2048]
+.hv_12bpc:
+ movifnidn hd, hm
+ cmp wd, 4
+ jg .hv_w8
+ je .hv_w4
+.hv_w2:
+ movddup m0, [srcq+ssq*0]
+ pshufhw m1, m0, q0321
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w2_loop:
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m2, [srcq+ssq*0]
+ pmullw m1, m4, m2
+ psrlq m2, 16
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 _ 2 _
+ shufpd m2, m0, m1, 0x01 ; 0 _ 1 _
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ movd [dstq+dsq*0], m1
+ punpckhqdq m1, m1
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ movddup m0, [srcq+ssq*0]
+ movddup m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w4_loop:
+ movq m1, [srcq+ssq*1]
+ movq m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
+ movhps m2, [srcq+ssq*0+2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ shufpd m2, m0, m1, 0x01 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+%if ARCH_X86_64
+%if WIN64
+ push r7
+%endif
+ shl wd, 5
+ lea r6d, [wq+hq-256]
+ mov r4, srcq
+ mov r7, dstq
+%else
+ mov r6, srcq
+%endif
+.hv_w8_loop0:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w8_loop:
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2
+ psubw m2, m1, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ pmulhrsw m2, m7
+ mova [dstq+dsq*0], m2
+ movu m0, [srcq+ssq*0]
+ movu m2, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m2, m5
+ paddw m0, m3
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m1
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m1
+ pmulhrsw m2, m7
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+%if ARCH_X86_64
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+%else
+ mov dstq, dstmp
+ add r6, 16
+ mov hd, hm
+ add dstq, 16
+ mov srcq, r6
+ mov dstmp, dstq
+ sub wd, 8
+%endif
+ jg .hv_w8_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+
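+; prep_bilin_16bpc produces the signed intermediates used by the compound
+; prediction kernels: the copy path scales by prep_mul and subtracts pw_8192,
+; and the filtered paths subtract pw_32766 before their arithmetic shifts.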
+cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
+%define base r6-prep_ssse3
+ movifnidn mxyd, r5m ; mx
+ LEA r6, prep_ssse3
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ tzcnt wd, wd
+ movzx wd, word [base+prep_ssse3_table+wq*2]
+ mov r5d, r7m ; bitdepth_max
+ mova m5, [base+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ movddup m4, [base+prep_mul+r5*8]
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*2]
+ movhps m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+ movu m2, [srcq+strideq*1+16*0]
+ movu m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .prep_w16
+ RET
+.prep_w32:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ dec hd
+ jg .prep_w32
+ RET
+.prep_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*4], m0
+ mova [tmpq+16*5], m1
+ mova [tmpq+16*6], m2
+ mova [tmpq+16*7], m3
+ add tmpq, 16*8
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ movu m0, [srcq+16* 0]
+ movu m1, [srcq+16* 1]
+ movu m2, [srcq+16* 2]
+ movu m3, [srcq+16* 3]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ movu m0, [srcq+16* 4]
+ movu m1, [srcq+16* 5]
+ movu m2, [srcq+16* 6]
+ movu m3, [srcq+16* 7]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*4], m0
+ mova [tmpq+16*5], m1
+ mova [tmpq+16*6], m2
+ mova [tmpq+16*7], m3
+ movu m0, [srcq+16* 8]
+ movu m1, [srcq+16* 9]
+ movu m2, [srcq+16*10]
+ movu m3, [srcq+16*11]
+ add tmpq, 16*16
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq-16*8], m0
+ mova [tmpq-16*7], m1
+ mova [tmpq-16*6], m2
+ mova [tmpq-16*5], m3
+ movu m0, [srcq+16*12]
+ movu m1, [srcq+16*13]
+ movu m2, [srcq+16*14]
+ movu m3, [srcq+16*15]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq-16*4], m0
+ mova [tmpq-16*3], m1
+ mova [tmpq-16*2], m2
+ mova [tmpq-16*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ movd m4, mxyd
+ mov mxyd, r6m ; my
+ mova m3, [base+pw_16]
+ pshufb m4, [base+pw_256]
+ mova m5, [base+pw_32766]
+ psubw m3, m4
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m3, 2
+ psllw m4, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ sub wd, 8
+ je .h_w8
+ jg .h_w16
+.h_w4:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*0+2]
+ movhps m1, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 16
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ lea srcq, [srcq+wq*2]
+ neg wq
+.h_w16_loop0:
+ mov r6, wq
+.h_w16_loop:
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ movu m1, [srcq+r6*2+16]
+ movu m2, [srcq+r6*2+18]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ add r6, 16
+ jl .h_w16_loop
+ add srcq, strideq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ movd m4, mxyd
+ mova m3, [base+pw_16]
+ pshufb m4, [base+pw_256]
+ mova m5, [base+pw_32766]
+ psubw m3, m4
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m3, 2
+ psllw m4, 2
+.v_12bpc:
+ cmp wd, 8
+ je .v_w8
+ jg .v_w16
+.v_w4:
+ movq m0, [srcq+strideq*0]
+.v_w4_loop:
+ movq m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklqdq m1, m0, m2 ; 0 1
+ movq m0, [srcq+strideq*0]
+ punpcklqdq m2, m0 ; 1 2
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu m0, [srcq+strideq*0]
+.v_w8_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4, m2
+ psubw m0, m5
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m3
+ mova [tmpq+16*0], m1
+ pmullw m1, m4, m0
+ psubw m2, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+%if WIN64
+ push r7
+%endif
+ mov r5, srcq
+%if ARCH_X86_64
+ lea r6d, [wq*4-32]
+ mov wd, wd
+ lea r6d, [hq+r6*8]
+ mov r7, tmpq
+%else
+ mov r6d, wd
+%endif
+.v_w16_loop0:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4, m2
+ psubw m0, m5
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m3
+ mova [tmpq+wq*0], m1
+ pmullw m1, m4, m0
+ psubw m2, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+wq*2], m1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w16_loop
+%if ARCH_X86_64
+ add r5, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+%else
+ mov tmpq, tmpmp
+ add r5, 16
+ mov hd, hm
+ add tmpq, 16
+ mov srcq, r5
+ mov tmpmp, tmpq
+ sub r6d, 8
+%endif
+ jg .v_w16_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 7
+ shl mxyd, 11
+ movd m6, mxyd
+ pshufb m6, [base+pw_256]
+ cmp wd, 8
+ je .hv_w8
+ jg .hv_w16
+.hv_w4:
+ movddup m0, [srcq+strideq*0]
+ movddup m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w4_loop:
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ movhps m1, [srcq+strideq*0]
+ movhps m2, [srcq+strideq*0+2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ shufpd m2, m0, m1, 0x01 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w8_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+16*0], m2
+ movu m0, [srcq+strideq*0]
+ movu m2, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m2, m4
+ psubw m0, m5
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+%if WIN64
+ push r7
+%endif
+ mov r5, srcq
+%if ARCH_X86_64
+ lea r6d, [wq*4-32]
+ mov wd, wd
+ lea r6d, [hq+r6*8]
+ mov r7, tmpq
+%else
+ mov r6d, wd
+%endif
+.hv_w16_loop0:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w16_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+wq*0], m2
+ movu m0, [srcq+strideq*0]
+ movu m2, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m2, m4
+ psubw m0, m5
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+wq*2], m2
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w16_loop
+%if ARCH_X86_64
+ add r5, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+%else
+ mov tmpq, tmpmp
+ add r5, 16
+ mov hd, hm
+ add tmpq, 16
+ mov srcq, r5
+ mov tmpmp, tmpq
+ sub r6d, 8
+%endif
+ jg .hv_w16_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
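+; FN stamps out one small entry point per (horizontal, vertical) filter type:
+; t0d/t1d get the packed subpel_filters row offsets (8-tap set in the high
+; half, 4-tap fallback set in the low half) before falling through, or
+; jumping, to the shared 16bpc body.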
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2, 6
+%elif WIN64
+DECLARE_REG_TMP 4, 5, 8
+%else
+DECLARE_REG_TMP 7, 8, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
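+; put_8tap_16bpc: when both subpel fractions are zero this tail-calls the
+; put_bilin copy paths via put_ssse3_table; otherwise it selects .h, .v or
+; the separable two-pass .hv filter.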
+%if ARCH_X86_32
+cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my
+%define mxb r0b
+%define mxd r0
+%define mxq r0
+%define myb r1b
+%define myd r1
+%define myq r1
+%define m8 [esp+16*0]
+%define m9 [esp+16*1]
+%define m10 [esp+16*2]
+%define m11 [esp+16*3]
+%define m12 [esp+16*4]
+%define m13 [esp+16*5]
+%define m14 [esp+16*6]
+%define m15 [esp+16*7]
+%else
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%endif
+%define base t2-put_ssse3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ LEA t2, put_ssse3
+ movifnidn wd, wm
+ movifnidn srcq, srcmp
+ movifnidn ssq, ssmp
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [base+put_ssse3_table+wq*2]
+ movifnidn dstq, dstmp
+ movifnidn dsq, dsmp
+ add wq, t2
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov myd, r8m
+ movd m5, r8m
+ shr myd, 11
+ movddup m4, [base+put_8tap_h_rnd+myq*8]
+ movifnidn dsq, dsmp
+ pshufb m5, [base+pw_256]
+ cmp wd, 4
+ jg .h_w8
+ movzx mxd, mxb
+ lea srcq, [srcq-2]
+ movq m3, [base+subpel_filters+mxq*8]
+ movifnidn dstq, dstmp
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ je .h_w4
+.h_w2:
+ mova m2, [base+spel_h_shuf2]
+ pshufd m3, m3, q2121
+.h_w2_loop:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m2
+ pshufb m1, m2
+ pmaddwd m0, m3
+ pmaddwd m1, m3
+ phaddd m0, m1
+ paddd m0, m4
+ psrad m0, 6
+ packssdw m0, m0
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ movd [dstq+dsq*0], m0
+ pshuflw m0, m0, q3232
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ WIN64_SPILL_XMM 8
+ mova m6, [base+spel_h_shufA]
+ mova m7, [base+spel_h_shufB]
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q2222
+.h_w4_loop:
+ movu m1, [srcq]
+ add srcq, ssq
+ pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m2
+ pmaddwd m1, m3
+ paddd m0, m4
+ paddd m0, m1
+ psrad m0, 6
+ packssdw m0, m0
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ movq [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w4_loop
+ RET
+.h_w8:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+%endif
+ shr mxd, 16
+ movq m3, [base+subpel_filters+mxq*8]
+ movifnidn dstq, dstmp
+ mova m6, [base+spel_h_shufA]
+ mova m7, [base+spel_h_shufB]
+%if UNIX64
+ mov wd, wd
+%endif
+ lea srcq, [srcq+wq*2]
+ punpcklbw m3, m3
+ lea dstq, [dstq+wq*2]
+ psraw m3, 8
+ neg wq
+%if ARCH_X86_32
+ ALLOC_STACK -16*4
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+.h_w8_loop0:
+ mov r6, wq
+.h_w8_loop:
+ movu m0, [srcq+r6*2- 6]
+ movu m1, [srcq+r6*2+ 2]
+ pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m0, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m2, m8 ; abcd0
+ pmaddwd m0, m9 ; abcd1
+ pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8
+ pshufb m1, m7 ; 6 7 7 8 8 9 9 a
+ paddd m2, m4
+ paddd m0, m2
+ pmaddwd m2, m10, m3 ; abcd2
+ pmaddwd m3, m8 ; efgh0
+ paddd m0, m2
+ pmaddwd m2, m11, m1 ; abcd3
+ pmaddwd m1, m9 ; efgh1
+ paddd m0, m2
+ movu m2, [srcq+r6*2+10]
+ paddd m3, m4
+ paddd m1, m3
+ pshufb m3, m2, m6 ; 8 9 9 a a b b c
+ pshufb m2, m7 ; a b b c c d d e
+ pmaddwd m3, m10 ; efgh2
+ pmaddwd m2, m11 ; efgh3
+ paddd m1, m3
+ paddd m1, m2
+ psrad m0, 6
+ psrad m1, 6
+ packssdw m0, m1
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ mova [dstq+r6*2], m0
+ add r6, 8
+ jl .h_w8_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w8_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+%if WIN64
+ WIN64_SPILL_XMM 15
+%endif
+ movd m7, r8m
+ movifnidn dstq, dstmp
+ movifnidn dsq, dsmp
+ punpcklbw m3, m3
+ pshufb m7, [base+pw_256]
+ psraw m3, 8 ; sign-extend
+%if ARCH_X86_32
+ ALLOC_STACK -16*7
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+ lea r6, [ssq*3]
+ sub srcq, r6
+ cmp wd, 2
+ jne .v_w4
+.v_w2:
+ movd m1, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ movd m2, [srcq+ssq*2]
+ add srcq, r6
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m6, [srcq+ssq*2]
+ add srcq, r6
+ movd m0, [srcq+ssq*0]
+ punpckldq m1, m4 ; 0 1
+ punpckldq m4, m2 ; 1 2
+ punpckldq m2, m5 ; 2 3
+ punpckldq m5, m3 ; 3 4
+ punpckldq m3, m6 ; 4 5
+ punpckldq m6, m0 ; 5 6
+ punpcklwd m1, m4 ; 01 12
+ punpcklwd m2, m5 ; 23 34
+ punpcklwd m3, m6 ; 45 56
+ pxor m6, m6
+.v_w2_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ punpckldq m3, m0, m4 ; 6 7
+ movd m0, [srcq+ssq*0]
+ punpckldq m4, m0 ; 7 8
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pavgw m5, m6
+ pminsw m5, m7
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q3232
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+%if ARCH_X86_32
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*29], srcq
+ mov [esp+4*30], dstq
+%else
+ mov srcmp, srcq
+%endif
+ lea wd, [wq+hq-(1<<16)]
+%else
+ shl wd, 6
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [wq+hq-(1<<8)]
+%endif
+.v_w4_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*2]
+ add srcq, r6
+ movq m4, [srcq+ssq*0]
+ movq m5, [srcq+ssq*1]
+ movq m6, [srcq+ssq*2]
+ add srcq, r6
+ movq m0, [srcq+ssq*0]
+ punpcklwd m1, m2 ; 01
+ punpcklwd m2, m3 ; 12
+ punpcklwd m3, m4 ; 23
+ punpcklwd m4, m5 ; 34
+ punpcklwd m5, m6 ; 45
+ punpcklwd m6, m0 ; 56
+%if ARCH_X86_32
+ jmp .v_w4_loop_start
+.v_w4_loop:
+ mova m1, m12
+ mova m2, m13
+ mova m3, m14
+.v_w4_loop_start:
+ pmaddwd m1, m8 ; a0
+ pmaddwd m2, m8 ; b0
+ mova m12, m3
+ mova m13, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m1, m3
+ paddd m2, m4
+ mova m14, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m3, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m1, m3
+ pmaddwd m3, m11, m6 ; b3
+ paddd m2, m3
+ psrad m1, 5
+ psrad m2, 5
+ packssdw m1, m2
+ pxor m2, m2
+ pmaxsw m1, m2
+ pavgw m1, m2
+ pminsw m1, m7
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+%if STACK_ALIGNMENT < 16
+ mov srcq, [esp+4*29]
+ mov dstq, [esp+4*30]
+ movzx hd, ww
+ add srcq, 8
+ add dstq, 8
+ mov [esp+4*29], srcq
+ mov [esp+4*30], dstq
+%else
+ mov srcq, srcmp
+ mov dstq, dstmp
+ movzx hd, ww
+ add srcq, 8
+ add dstq, 8
+ mov srcmp, srcq
+ mov dstmp, dstq
+%endif
+ sub wd, 1<<16
+%else
+.v_w4_loop:
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ paddd m13, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m14, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 5
+ psrad m13, 5
+ packssdw m12, m13
+ pxor m13, m13
+ pmaxsw m12, m13
+ pavgw m12, m13
+ pminsw m12, m7
+ movq [dstq+dsq*0], m12
+ movhps [dstq+dsq*1], m12
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ add r7, 8
+ add r8, 8
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+%endif
+ jg .v_w4_loop0
+ RET
+.hv:
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+%if ARCH_X86_32
+ movd m4, r8m
+ mova m6, [base+pd_512]
+ pshufb m4, [base+pw_256]
+%else
+%if WIN64
+ ALLOC_STACK 16*6, 16
+%endif
+ movd m15, r8m
+ pshufb m15, [base+pw_256]
+%endif
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ je .hv_w4
+ movq m0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+ mov dstq, dstmp
+ mov dsq, dsmp
+ mova m5, [base+spel_h_shuf2]
+ ALLOC_STACK -16*8
+%else
+ mova m6, [base+pd_512]
+ mova m9, [base+spel_h_shuf2]
+%endif
+ pshuflw m0, m0, q2121
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ test dword r8m, 0x800
+ jz .hv_w2_10bpc
+ psraw m7, 2
+ psllw m3, 2
+.hv_w2_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+%if ARCH_X86_32
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m9, m5
+ mova m11, m0
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+ mova m15, m4
+%else
+ pshufd m11, m3, q0000
+ pshufd m12, m3, q1111
+ pshufd m13, m3, q2222
+ pshufd m14, m3, q3333
+%endif
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ movu m1, [srcq+ssq*2]
+ add srcq, r6
+ movu m4, [srcq+ssq*0]
+%if ARCH_X86_32
+ REPX {pshufb x, m5}, m2, m3, m1, m4
+%else
+ REPX {pshufb x, m9}, m2, m3, m1, m4
+%endif
+ REPX {pmaddwd x, m7}, m2, m3, m1, m4
+ phaddd m2, m3 ; 0 1
+ phaddd m1, m4 ; 2 3
+ movu m3, [srcq+ssq*1]
+ movu m4, [srcq+ssq*2]
+ add srcq, r6
+ movu m0, [srcq+ssq*0]
+%if ARCH_X86_32
+ REPX {pshufb x, m5}, m3, m4, m0
+%else
+ REPX {pshufb x, m9}, m3, m4, m0
+%endif
+ REPX {pmaddwd x, m7}, m3, m4, m0
+ phaddd m3, m4 ; 4 5
+ phaddd m0, m0 ; 6 6
+ REPX {paddd x, m6}, m2, m1, m3, m0
+ REPX {psrad x, 10}, m2, m1, m3, m0
+ packssdw m2, m1 ; 0 1 2 3
+ packssdw m3, m0 ; 4 5 6 _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ pshufd m5, m3, q0321 ; 5 6 _ _
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ punpcklwd m3, m5 ; 45 56
+.hv_w2_loop:
+ movu m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movu m5, [srcq+ssq*0]
+ pshufb m4, m9
+ pshufb m5, m9
+ pmaddwd m4, m7
+ pmaddwd m5, m7
+ phaddd m4, m5
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ paddd m4, m6
+ psrad m4, 10 ; 7 8
+ packssdw m0, m4
+ pshufd m3, m0, q2103
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m5, m6
+ paddd m5, m4
+ psrad m5, 10
+ packssdw m5, m5
+ pxor m4, m4
+ pminsw m5, m15
+ pmaxsw m5, m4
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q3232
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+.hv_w4:
+ movq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ mov dstq, dstmp
+ mov dsq, dsmp
+ mova m0, [base+spel_h_shufA]
+ mova m1, [base+spel_h_shufB]
+ ALLOC_STACK -16*15
+ mova m8, m0
+ mova m9, m1
+ mova m14, m6
+%else
+ mova m8, [base+spel_h_shufA]
+ mova m9, [base+spel_h_shufB]
+%endif
+ pxor m0, m0
+ punpcklbw m0, m2
+ punpcklbw m3, m3
+ psraw m3, 8
+ test dword r8m, 0x800
+ jz .hv_w4_10bpc
+ psraw m0, 2
+ psllw m3, 2
+.hv_w4_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+%if ARCH_X86_32
+ %define tmp esp+16*8
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*61], srcq
+ mov [esp+4*62], dstq
+%else
+ mov srcmp, srcq
+%endif
+ mova [tmp+16*5], m4
+ lea wd, [wq+hq-(1<<16)]
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ pshufd m5, m0, q2222
+ pshufd m0, m0, q3333
+ mova m10, m1
+ mova m11, m2
+ mova m12, m5
+ mova m13, m0
+%else
+%if WIN64
+ %define tmp rsp
+%else
+ %define tmp rsp-104 ; red zone
+%endif
+ shl wd, 6
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [wq+hq-(1<<8)]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+ mova [tmp+16*5], m15
+%endif
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [tmp+16*1], m0
+ mova [tmp+16*2], m1
+ mova [tmp+16*3], m2
+ mova [tmp+16*4], m3
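+; PUT_8TAP_HV_H: horizontal 8-tap pass for one row, with the low half of the
+; row in m%1 and the high half in m%2. The result is left as dwords in m%1
+; after adding the rounding constant (%5, m14 by default) and shifting by %4.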
+%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
+ pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
+ pshufb m%1, m9 ; 2 3 3 4 4 5 5 6
+ pmaddwd m%3, m10
+ pmaddwd m%1, m11
+ paddd m%3, %5
+ paddd m%1, m%3
+ pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ pmaddwd m%3, m12
+ pmaddwd m%2, m13
+ paddd m%1, m%3
+ paddd m%1, m%2
+ psrad m%1, %4
+%endmacro
+.hv_w4_loop0:
+%if ARCH_X86_64
+ mova m14, [pd_512]
+%endif
+ movu m4, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ movu m5, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ movu m6, [srcq+ssq*2+0]
+ movu m3, [srcq+ssq*2+8]
+ add srcq, r6
+ PUT_8TAP_HV_H 4, 1, 0, 10
+ PUT_8TAP_HV_H 5, 2, 0, 10
+ PUT_8TAP_HV_H 6, 3, 0, 10
+ movu m7, [srcq+ssq*0+0]
+ movu m2, [srcq+ssq*0+8]
+ movu m1, [srcq+ssq*1+0]
+ movu m3, [srcq+ssq*1+8]
+ PUT_8TAP_HV_H 7, 2, 0, 10
+ PUT_8TAP_HV_H 1, 3, 0, 10
+ movu m2, [srcq+ssq*2+0]
+ movu m3, [srcq+ssq*2+8]
+ add srcq, r6
+ PUT_8TAP_HV_H 2, 3, 0, 10
+ packssdw m4, m7 ; 0 3
+ packssdw m5, m1 ; 1 4
+ movu m0, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 1, 3, 10
+ packssdw m6, m2 ; 2 5
+ packssdw m7, m0 ; 3 6
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+%if ARCH_X86_32
+ jmp .hv_w4_loop_start
+.hv_w4_loop:
+ mova m1, [tmp+16*6]
+ mova m2, m15
+.hv_w4_loop_start:
+ mova m7, [tmp+16*1]
+ pmaddwd m1, m7 ; a0
+ pmaddwd m2, m7 ; b0
+ mova m7, [tmp+16*2]
+ mova [tmp+16*6], m3
+ pmaddwd m3, m7 ; a1
+ mova m15, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m1, m3
+ paddd m2, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 10
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 10
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m1, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m2, m7 ; b3
+ psrad m1, 9
+ psrad m2, 9
+ packssdw m1, m2
+ pxor m7, m7
+ pmaxsw m1, m7
+ pavgw m7, m1
+ pminsw m7, [tmp+16*5]
+ movq [dstq+dsq*0], m7
+ movhps [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+%if STACK_ALIGNMENT < 16
+ mov srcq, [esp+4*61]
+ mov dstq, [esp+4*62]
+ add srcq, 8
+ add dstq, 8
+ mov [esp+4*61], srcq
+ mov [esp+4*62], dstq
+%else
+ mov srcq, srcmp
+ mov dstq, dstmp
+ add srcq, 8
+ add dstq, 8
+ mov srcmp, srcq
+ mov dstmp, dstq
+%endif
+ movzx hd, ww
+ sub wd, 1<<16
+%else
+.hv_w4_loop:
+ mova m15, [tmp+16*1]
+ pmaddwd m14, m15, m1 ; a0
+ pmaddwd m15, m2 ; b0
+ mova m7, [tmp+16*2]
+ mova m1, m3
+ pmaddwd m3, m7 ; a1
+ mova m2, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m14, m3
+ paddd m15, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m14, m5
+ paddd m15, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512]
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512]
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m14, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m15, m7 ; b3
+ psrad m14, 9
+ psrad m15, 9
+ packssdw m14, m15
+ pxor m7, m7
+ pmaxsw m14, m7
+ pavgw m7, m14
+ pminsw m7, [tmp+16*5]
+ movq [dstq+dsq*0], m7
+ movhps [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ add r7, 8
+ add r8, 8
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+%endif
+ jg .hv_w4_loop0
+ RET
+%undef tmp
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 2, 1, 6, 4
+%elif WIN64
+DECLARE_REG_TMP 6, 4, 7, 4
+%else
+DECLARE_REG_TMP 6, 7, 7, 8
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
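+; prep_8tap_16bpc mirrors put_8tap_16bpc but writes unclamped intermediates:
+; the zero-fraction case reuses the prep_bilin copy paths, .h/.v shift by 4
+; with prep_8tap_1d_rnd, and .hv shifts by 6 with prep_8tap_2d_rnd.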
+%if ARCH_X86_32
+cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my
+%define mxb r0b
+%define mxd r0
+%define mxq r0
+%define myb r2b
+%define myd r2
+%define myq r2
+%else
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
+%endif
+%define base t2-prep_ssse3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ LEA t2, prep_ssse3
+ movifnidn wd, wm
+ movifnidn srcq, srcmp
+ test mxd, 0xf00
+ jnz .h
+ movifnidn hd, hm
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov myd, r7m ; bitdepth_max
+ movzx wd, word [base+prep_ssse3_table+wq*2]
+ mova m5, [base+pw_8192]
+ shr myd, 11
+ add wq, t2
+ movddup m4, [base+prep_mul+myq*8]
+ movifnidn ssq, ssmp
+ movifnidn tmpq, tmpmp
+ lea r6, [ssq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ movifnidn ssq, r2mp
+ movifnidn hd, r4m
+ movddup m5, [base+prep_8tap_1d_rnd]
+ cmp wd, 4
+ jne .h_w8
+ movzx mxd, mxb
+ movq m0, [base+subpel_filters+mxq*8]
+ mova m3, [base+spel_h_shufA]
+ mova m4, [base+spel_h_shufB]
+ movifnidn tmpq, tmpmp
+ sub srcq, 2
+ WIN64_SPILL_XMM 8
+ punpcklbw m0, m0
+ psraw m0, 8
+ test dword r7m, 0x800
+ jnz .h_w4_12bpc
+ psllw m0, 2
+.h_w4_12bpc:
+ pshufd m6, m0, q1111
+ pshufd m7, m0, q2222
+.h_w4_loop:
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m4 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ paddd m0, m5
+ paddd m0, m1
+ pshufb m1, m2, m3
+ pshufb m2, m4
+ pmaddwd m1, m6
+ pmaddwd m2, m7
+ paddd m1, m5
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq], m0
+ add tmpq, 16
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ WIN64_SPILL_XMM 11
+ shr mxd, 16
+ movq m2, [base+subpel_filters+mxq*8]
+ mova m4, [base+spel_h_shufA]
+ mova m6, [base+spel_h_shufB]
+ movifnidn tmpq, r0mp
+ add wd, wd
+ punpcklbw m2, m2
+ add srcq, wq
+ psraw m2, 8
+ add tmpq, wq
+ neg wq
+ test dword r7m, 0x800
+ jnz .h_w8_12bpc
+ psllw m2, 2
+.h_w8_12bpc:
+ pshufd m7, m2, q0000
+%if ARCH_X86_32
+ ALLOC_STACK -16*3
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+%else
+ pshufd m8, m2, q1111
+ pshufd m9, m2, q2222
+ pshufd m10, m2, q3333
+%endif
+.h_w8_loop0:
+ mov r6, wq
+.h_w8_loop:
+ movu m0, [srcq+r6- 6]
+ movu m1, [srcq+r6+ 2]
+ pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4
+ pshufb m0, m6 ; 2 3 3 4 4 5 5 6
+ pmaddwd m2, m7 ; abcd0
+ pmaddwd m0, m8 ; abcd1
+ pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8
+ pshufb m1, m6 ; 6 7 7 8 8 9 9 a
+ paddd m2, m5
+ paddd m0, m2
+ pmaddwd m2, m9, m3 ; abcd2
+ pmaddwd m3, m7 ; efgh0
+ paddd m0, m2
+ pmaddwd m2, m10, m1 ; abcd3
+ pmaddwd m1, m8 ; efgh1
+ paddd m0, m2
+ movu m2, [srcq+r6+10]
+ paddd m3, m5
+ paddd m1, m3
+ pshufb m3, m2, m4 ; 8 9 9 a a b b c
+ pshufb m2, m6 ; a b b c c d d e
+ pmaddwd m3, m9 ; efgh2
+ pmaddwd m2, m10 ; efgh3
+ paddd m1, m3
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq+r6], m0
+ add r6, 16
+ jl .h_w8_loop
+ add srcq, ssq
+ sub tmpq, wq
+ dec hd
+ jg .h_w8_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ WIN64_SPILL_XMM 15
+ movddup m7, [base+prep_8tap_1d_rnd]
+ movifnidn ssq, r2mp
+ movifnidn tmpq, r0mp
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m3, 2
+.v_12bpc:
+%if ARCH_X86_32
+ ALLOC_STACK -16*7
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+ lea r6, [ssq*3]
+ sub srcq, r6
+ mov r6d, wd
+ shl wd, 6
+ mov r5, srcq
+%if ARCH_X86_64
+ mov r7, tmpq
+%elif STACK_ALIGNMENT < 16
+ mov [esp+4*29], tmpq
+%endif
+ lea wd, [wq+hq-(1<<8)]
+.v_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m3, [srcq+ssq*0]
+ movq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m5, [srcq+ssq*0]
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m0, [srcq+ssq*0]
+ punpcklwd m1, m2 ; 01
+ punpcklwd m2, m3 ; 12
+ punpcklwd m3, m4 ; 23
+ punpcklwd m4, m5 ; 34
+ punpcklwd m5, m6 ; 45
+ punpcklwd m6, m0 ; 56
+%if ARCH_X86_32
+ jmp .v_loop_start
+.v_loop:
+ mova m1, m12
+ mova m2, m13
+ mova m3, m14
+.v_loop_start:
+ pmaddwd m1, m8 ; a0
+ pmaddwd m2, m8 ; b0
+ mova m12, m3
+ mova m13, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m1, m3
+ paddd m2, m4
+ mova m14, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m3, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m1, m7
+ paddd m1, m3
+ pmaddwd m3, m11, m6 ; b3
+ paddd m2, m7
+ paddd m2, m3
+ psrad m1, 4
+ psrad m2, 4
+ packssdw m1, m2
+ movq [tmpq+r6*0], m1
+ movhps [tmpq+r6*2], m1
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .v_loop
+%if STACK_ALIGNMENT < 16
+ mov tmpq, [esp+4*29]
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov [esp+4*29], tmpq
+%else
+ mov tmpq, tmpmp
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov tmpmp, tmpq
+%endif
+%else
+.v_loop:
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ paddd m13, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m14, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m12, m7
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m7
+ paddd m13, m14
+ psrad m12, 4
+ psrad m13, 4
+ packssdw m12, m13
+ movq [tmpq+r6*0], m12
+ movhps [tmpq+r6*2], m12
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .v_loop
+ add r5, 8
+ add r7, 8
+ mov srcq, r5
+ mov tmpq, r7
+%endif
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .v_loop0
+ RET
+.hv:
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ movzx t3d, mxb
+ shr mxd, 16
+ cmp wd, 4
+ cmove mxd, t3d
+ movifnidn hd, r4m
+ movq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+ mov ssq, r2mp
+ mov tmpq, r0mp
+ mova m0, [base+spel_h_shufA]
+ mova m1, [base+spel_h_shufB]
+ mova m4, [base+prep_8tap_2d_rnd]
+ ALLOC_STACK -16*14
+ mova m8, m0
+ mova m9, m1
+ mova m14, m4
+%else
+%if WIN64
+ ALLOC_STACK 16*6, 16
+%endif
+ mova m8, [base+spel_h_shufA]
+ mova m9, [base+spel_h_shufB]
+%endif
+ pxor m0, m0
+ punpcklbw m0, m2
+ punpcklbw m3, m3
+ psraw m0, 4
+ psraw m3, 8
+ test dword r7m, 0x800
+ jz .hv_10bpc
+ psraw m0, 2
+.hv_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ mov r6d, wd
+ shl wd, 6
+ mov r5, srcq
+%if ARCH_X86_32
+ %define tmp esp+16*8
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*61], tmpq
+%endif
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ pshufd m5, m0, q2222
+ pshufd m0, m0, q3333
+ mova m10, m1
+ mova m11, m2
+ mova m12, m5
+ mova m13, m0
+%else
+%if WIN64
+ %define tmp rsp
+%else
+ %define tmp rsp-88 ; red zone
+%endif
+ mov r7, tmpq
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ lea wd, [wq+hq-(1<<8)]
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [tmp+16*1], m0
+ mova [tmp+16*2], m1
+ mova [tmp+16*3], m2
+ mova [tmp+16*4], m3
+.hv_loop0:
+%if ARCH_X86_64
+ mova m14, [prep_8tap_2d_rnd]
+%endif
+ movu m4, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ movu m5, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ movu m6, [srcq+ssq*0+0]
+ movu m3, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 4, 1, 0, 6
+ PUT_8TAP_HV_H 5, 2, 0, 6
+ PUT_8TAP_HV_H 6, 3, 0, 6
+ movu m7, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ movu m1, [srcq+ssq*0+0]
+ movu m3, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 7, 2, 0, 6
+ PUT_8TAP_HV_H 1, 3, 0, 6
+ movu m2, [srcq+ssq*1+0]
+ movu m3, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 2, 3, 0, 6
+ packssdw m4, m7 ; 0 3
+ packssdw m5, m1 ; 1 4
+ movu m0, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 1, 3, 6
+ packssdw m6, m2 ; 2 5
+ packssdw m7, m0 ; 3 6
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+%if ARCH_X86_32
+ jmp .hv_loop_start
+.hv_loop:
+ mova m1, [tmp+16*5]
+ mova m2, m15
+.hv_loop_start:
+ mova m7, [tmp+16*1]
+ pmaddwd m1, m7 ; a0
+ pmaddwd m2, m7 ; b0
+ mova m7, [tmp+16*2]
+ mova [tmp+16*5], m3
+ pmaddwd m3, m7 ; a1
+ mova m15, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m1, m14
+ paddd m2, m14
+ paddd m1, m3
+ paddd m2, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 6
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 6
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m1, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m2, m7 ; b3
+ psrad m1, 6
+ psrad m2, 6
+ packssdw m1, m2
+ movq [tmpq+r6*0], m1
+ movhps [tmpq+r6*2], m1
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .hv_loop
+%if STACK_ALIGNMENT < 16
+ mov tmpq, [esp+4*61]
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov [esp+4*61], tmpq
+%else
+ mov tmpq, tmpmp
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov tmpmp, tmpq
+%endif
+%else
+.hv_loop:
+ mova m15, [tmp+16*1]
+ mova m7, [prep_8tap_2d_rnd]
+ pmaddwd m14, m15, m1 ; a0
+ pmaddwd m15, m2 ; b0
+ paddd m14, m7
+ paddd m15, m7
+ mova m7, [tmp+16*2]
+ mova m1, m3
+ pmaddwd m3, m7 ; a1
+ mova m2, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m14, m3
+ paddd m15, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m14, m5
+ paddd m15, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd]
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd]
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m14, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m15, m7 ; b3
+ psrad m14, 6
+ psrad m15, 6
+ packssdw m14, m15
+ movq [tmpq+r6*0], m14
+ movhps [tmpq+r6*2], m14
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .hv_loop
+ add r5, 8
+ add r7, 8
+ mov srcq, r5
+ mov tmpq, r7
+%endif
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .hv_loop0
+ RET
+%undef tmp
+
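+; Helpers for the scaled MC code below. movifprep emits its mov only when the
+; prep variant is being assembled (isprep is set inside MC_8TAP_SCALED).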
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
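+; SAVE_REG/LOAD_REG/REMAP_REG shift the r* register aliases down by one so
+; the prep variant, whose arguments sit one register lower than put's, can
+; share the put code paths; the MCT_8TAP_SCALED_REMAP_REGS_TO_* macros apply
+; and undo that mapping around returns.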
+%macro SAVE_REG 1
+ %xdefine r%1_save r%1
+ %xdefine r%1q_save r%1q
+ %xdefine r%1d_save r%1d
+ %if ARCH_X86_32
+ %define r%1m_save [rstk+stack_offset+(%1+1)*4]
+ %endif
+%endmacro
+
+%macro LOAD_REG 1
+ %xdefine r%1 r%1_save
+ %xdefine r%1q r%1q_save
+ %xdefine r%1d r%1d_save
+ %if ARCH_X86_32
+ %define r%1m r%1m_save
+ %endif
+ %undef r%1d_save
+ %undef r%1q_save
+ %undef r%1_save
+%endmacro
+
+%macro REMAP_REG 2-3
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+ %if ARCH_X86_32
+ %if %3 == 0
+ %xdefine r%1m r%2m
+ %else
+ %define r%1m [rstk+stack_offset+(%1+1)*4]
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %if ARCH_X86_64
+ SAVE_REG 14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %else
+ SAVE_REG 5
+ %assign %%i 5
+ %rep 5
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j, 0
+ %assign %%i %%i-1
+ %endrep
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %if ARCH_X86_64
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 14
+ %else
+ %rep 4
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j, 1
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 5
+ %endif
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
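+; MC_4TAP_SCALED_H / MC_8TAP_SCALED_H gather one horizontally filtered row for
+; the scaled paths: each output column has its own source offset and filter
+; coefficients (kept in registers on x86-64, reloaded from the stack on
+; x86-32), and the dword sums are rounded, arithmetic-shifted and packed back
+; to words.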
+%if ARCH_X86_32
+ %macro MC_4TAP_SCALED_H 1 ; dst_mem
+ movu m7, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m5, [r4 +ssq*0]
+ movu m6, [r4 +ssq*1]
+ lea srcq, [srcq+ssq*2]
+ lea r4, [r4 +ssq*2]
+ REPX {pshufb x, m12}, m7, m2
+ REPX {pmaddwd x, m13}, m7, m2
+ REPX {pshufb x, m14}, m5, m6
+ REPX {pmaddwd x, m15}, m5, m6
+ phaddd m7, m5
+ phaddd m2, m6
+ mova m5, [esp+0x00]
+ movd m6, [esp+0x10]
+ paddd m7, m5
+ paddd m2, m5
+ psrad m7, m6
+ psrad m2, m6
+ packssdw m7, m2
+ mova [stk+%1], m7
+ %endmacro
+%endif
+
+%if ARCH_X86_64
+ %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movu m%1, [srcq+ r4*2]
+ movu m%2, [srcq+ r6*2]
+ movu m%3, [srcq+ r7*2]
+ movu m%4, [srcq+ r9*2]
+ movu m%5, [srcq+r10*2]
+ movu m%6, [srcq+r11*2]
+ movu m%7, [srcq+r13*2]
+ movu m%8, [srcq+ rX*2]
+ add srcq, ssq
+ pmaddwd m%1, [stk+0x10]
+ pmaddwd m%2, [stk+0x20]
+ pmaddwd m%3, [stk+0x30]
+ pmaddwd m%4, [stk+0x40]
+ pmaddwd m%5, [stk+0x50]
+ pmaddwd m%6, [stk+0x60]
+ pmaddwd m%7, [stk+0x70]
+ pmaddwd m%8, [stk+0x80]
+ phaddd m%1, m%2
+ phaddd m%3, m%4
+ phaddd m%5, m%6
+ phaddd m%7, m%8
+ phaddd m%1, m%3
+ phaddd m%5, m%7
+ paddd m%1, hround
+ paddd m%5, hround
+ psrad m%1, m12
+ psrad m%5, m12
+ packssdw m%1, m%5
+ %endmacro
+%else
+ %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
+ %if %3 == 1
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ %endif
+ movu m0, [srcq+r0*2]
+ movu m1, [srcq+rX*2]
+ movu m2, [srcq+r4*2]
+ movu m3, [srcq+r5*2]
+ mov r0, [stk+16]
+ mov rX, [stk+20]
+ mov r4, [stk+24]
+ mov r5, [stk+28]
+ pmaddwd m0, [stk+%1+0x00]
+ pmaddwd m1, [stk+%1+0x10]
+ pmaddwd m2, [stk+%1+0x20]
+ pmaddwd m3, [stk+%1+0x30]
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m4, [srcq+r0*2]
+ movu m5, [srcq+rX*2]
+ movu m6, [srcq+r4*2]
+ movu m7, [srcq+r5*2]
+ add srcq, ssq
+ pmaddwd m4, [stk+%1+0xa0]
+ pmaddwd m5, [stk+%1+0xb0]
+ pmaddwd m6, [stk+%1+0xc0]
+ pmaddwd m7, [stk+%1+0xd0]
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m0, m2
+ phaddd m4, m6
+ paddd m0, hround
+ paddd m4, hround
+ psrad m0, m12
+ psrad m4, m12
+ packssdw m0, m4
+ %if %2 != 0
+ mova [stk+%2], m0
+ %endif
+ %endmacro
+%endif
+
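+; MC_8TAP_SCALED expands into the full put_/prep_8tap_scaled_16bpc function:
+; it computes the per-column phases mx+dx*[0..n], resolves the per-column
+; filters from subpel_filters, and dispatches through the SCALED_JMP_TABLE
+; entries, with the .dy1/.dy2 tables used when dy is exactly 1024 or 2048.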
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isput 1
+ %assign isprep 0
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %endif
+ %xdefine base_reg r12
+%else ; prep
+ %assign isput 0
+ %assign isprep 1
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %define tmp_stridem qword [stk+0x138]
+ %endif
+ %xdefine base_reg r11
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %define tmp_stridem dword [stk+0x138]
+ %endif
+%endif
+%if ARCH_X86_32
+ mov [esp+0x1f0], t0d
+ mov [esp+0x1f4], t1d
+ %if isput && required_stack_alignment > STACK_ALIGNMENT
+ mov dstd, dstm
+ mov dsd, dsm
+ mov srcd, srcm
+ mov ssd, ssm
+ mov hd, hm
+ mov r4, mxm
+ %define r0m [esp+0x200]
+ %define dsm [esp+0x204]
+ %define dsmp dsm
+ %define r1m dsm
+ %define r2m [esp+0x208]
+ %define ssm [esp+0x20c]
+ %define r3m ssm
+ %define hm [esp+0x210]
+ %define mxm [esp+0x214]
+ mov r0m, dstd
+ mov dsm, dsd
+ mov r2m, srcd
+ mov ssm, ssd
+ mov hm, hd
+ mov r0, mym
+ mov r1, dxm
+ mov r2, dym
+ %define mym [esp+0x218]
+ %define dxm [esp+0x21c]
+ %define dym [esp+0x220]
+ mov mxm, r4
+ mov mym, r0
+ mov dxm, r1
+ mov dym, r2
+ tzcnt wd, wm
+ %endif
+ %if isput
+ mov r3, pxmaxm
+ %define pxmaxm r3
+ %else
+ mov r2, pxmaxm
+ %endif
+ %if isprep && required_stack_alignment > STACK_ALIGNMENT
+ %xdefine base_reg r5
+ %else
+ %xdefine base_reg r6
+ %endif
+%endif
+ LEA base_reg, %1_8tap_scaled_16bpc_ssse3
+%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
+%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
+ tzcnt wd, wm
+%endif
+%if ARCH_X86_64
+ %if isput
+ mov r7d, pxmaxm
+ %endif
+%else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+%endif
+ movd m8, dxm
+ movd m14, mxm
+%if isput
+ movd m15, pxmaxm
+%endif
+ pshufd m8, m8, q0000
+ pshufd m14, m14, q0000
+%if isput
+ pshuflw m15, m15, q0000
+ punpcklqdq m15, m15
+%endif
+%if isprep
+ %if UNIX64
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+ %endif
+ %if ARCH_X86_64
+ mov r6d, pxmaxm
+ %endif
+%endif
+%if ARCH_X86_64
+ mov dyd, dym
+%endif
+%if isput
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %elif ARCH_X86_64
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %else
+ %endif
+ %if ARCH_X86_64
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+0x138]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+ %else
+ %define rX r1
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %elif ARCH_X86_64
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %xdefine hm r7m
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %if ARCH_X86_64
+ %define rX r14
+ %define rXd r14d
+ %else
+ %define rX r3
+ %endif
+%endif
+%if ARCH_X86_64
+ shr r7d, 11
+ mova m10, [base+pd_0x3ff]
+ movddup m11, [base+s_8tap_h_rnd+r7*8]
+ movd m12, [base+s_8tap_h_sh+r7*4]
+ %if isput
+ movddup m13, [base+put_s_8tap_v_rnd+r7*8]
+ movd m7, [base+put_s_8tap_v_sh+r7*4]
+ %define pxmaxm [rsp]
+ mova pxmaxm, m15
+ punpcklqdq m12, m7
+ %endif
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ shr r3, 11
+ movddup m1, [base+s_8tap_h_rnd+r3*8]
+ movd m2, [base+s_8tap_h_sh+r3*4]
+ %if isput
+ %define m13 [esp+0x20]
+ %define pxmaxm [esp+0x30]
+ %define stk esp+0x40
+ movddup m5, [base+put_s_8tap_v_rnd+r3*8]
+ movd m6, [base+put_s_8tap_v_sh+r3*4]
+ mova pxmaxm, m15
+ punpcklqdq m2, m6
+ mova m13, m5
+ %else
+ %define m13 [base+pd_m524256]
+ %endif
+ mov ssd, ssm
+ mova m11, m1
+ mova m12, m2
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ mov r1, [esp+0x1f4]
+ lea r0, [ssd*3]
+ movzx r2, r1b
+ shr r1, 16
+ cmp dword hm, 6
+ cmovs r1, r2
+ mov [esp+0x1f4], r1
+ %if isprep
+ mov r1, r1m
+ %endif
+ mov r2, r2m
+ sub srcq, r0
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define ss3q r0
+ %define myd r4
+ %define dyd dword dym
+ %define hd dword hm
+%endif
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ movzx r4, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r4
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6, m7
+ REPX {pmaddwd x, m15}, m4, m5, m6, m7
+ phaddd m0, m1
+ phaddd m2, m3
+ phaddd m4, m5
+ phaddd m6, m7
+ REPX {paddd x, m11}, m0, m2, m4, m6
+ REPX {psrad x, m12}, m0, m2, m4, m6
+ packssdw m0, m2 ; 0 1 2 3
+ packssdw m4, m6 ; 4 5 6 7
+ SWAP m1, m4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m1, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m14}, m1, m7, m6, m3
+ REPX {pmaddwd x, m15}, m1, m7, m6, m3
+ phaddd m1, m7
+ phaddd m6, m3
+ REPX {paddd x, m11}, m0, m2, m1, m6
+ REPX {psrad x, m12}, m0, m2, m1, m6
+ packssdw m0, m2
+ packssdw m1, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ %endif
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mov myd, mym
+ mov r0, r0m
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+.w2_loop:
+ and myd, 0x3ff
+ %if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m10, r6q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pmaddwd m5, m3, m7
+ pmaddwd m6, m0, m8
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddwd m7, m2, m9
+ pmaddwd m8, m4, m10
+ paddd m5, m6
+ paddd m7, m8
+ %else
+ mov r1, [esp+0x1f4]
+ xor r3, r3
+ mov r5, myd
+ shr r5, 6
+ lea r1, [r1+r5]
+ mov r5, 64 << 24
+ cmovnz r3, [base+subpel_filters+r1*8+4]
+ cmovnz r5, [base+subpel_filters+r1*8+0]
+ movd m6, r3
+ movd m7, r5
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m5, m7, q0000
+ pshufd m6, m7, q1111
+ pmaddwd m3, m5
+ pmaddwd m0, m6
+ pshufd m5, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m2, m5
+ pmaddwd m4, m7
+ paddd m3, m0
+ paddd m2, m4
+ SWAP m5, m3
+ SWAP m7, m2
+ %define m8 m3
+ %endif
+ paddd m5, m13
+ pshufd m6, m12, q1032
+ pxor m8, m8
+ paddd m5, m7
+ psrad m5, m6
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, pxmaxm
+ movd [dstq], m5
+ add dstq, dsmp
+ dec hd
+ jz .ret
+ %if ARCH_X86_64
+ add myd, dyd
+ %else
+ add myd, dym
+ %endif
+ test myd, ~0x3ff
+ %if ARCH_X86_32
+ SWAP m3, m5
+ SWAP m2, m7
+ mova m3, [stk+0x20]
+ mova m0, [stk+0x30]
+ mova m2, [stk+0x40]
+ mova m4, [stk+0x50]
+ %endif
+ jz .w2_loop
+ %if ARCH_X86_32
+ mov r3, r3m
+ %endif
+ movu m5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps m3, m0, q1032 ; 01 12
+ shufps m0, m2, q1032 ; 23 34
+ shufps m2, m4, q1032 ; 45 56
+ pshufb m5, m14
+ pmaddwd m5, m15
+ phaddd m5, m5
+ paddd m5, m11
+ psrad m5, m12
+ packssdw m5, m5
+ palignr m4, m5, m1, 12
+ punpcklqdq m1, m4, m4 ; 6 7 6 7
+ punpcklwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+ jmp .w2_loop
+.w2_skip_line:
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m3, m0 ; 01 12
+ mova m0, m2 ; 23 34
+ pshufb m5, m14
+ pshufb m6, m14
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ phaddd m5, m6
+ paddd m5, m11
+ psrad m5, m12
+ packssdw m5, m5 ; 6 7 6 7
+ punpckhqdq m1, m5 ; 4 5 6 7
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+ jmp .w2_loop
+%endif
+INIT_XMM ssse3
+.w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ movzx r4, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%else
+ %define m9 [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd rX, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ movifprep r3, r3m
+ SWAP m4, m7
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x20], m13
+ mova [stk+0x30], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ movu m1, [srcq+r4 ]
+ movu m3, [srcq+r6 ]
+ movu m2, [srcq+r11 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m13}, m7, m9, m8, m10
+ REPX {pshufb x, m14}, m1, m2, m3, m4
+ REPX {pmaddwd x, m15}, m1, m2, m3, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m4, [srcq+ss3q ]
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1
+ packssdw m8, m10 ; 2 3
+ movu m0, [srcq+r4 ]
+ movu m9, [srcq+r6 ]
+ movu m10, [srcq+r11 ]
+ movu m11, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m1, m2, m3, m4
+ REPX {pmaddwd x, m13}, m1, m2, m3, m4
+ REPX {pshufb x, m14}, m0, m9, m10, m11
+ REPX {pmaddwd x, m15}, m0, m9, m10, m11
+ phaddd m1, m0
+ phaddd m2, m9
+ phaddd m3, m10
+ phaddd m4, m11
+ REPX {paddd x, m5}, m1, m2, m3, m4
+ REPX {psrad x, xm6}, m1, m2, m3, m4
+ packssdw m1, m2 ; 4 5
+ packssdw m3, m4 ; 6 7
+ SWAP m9, m1
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ pshufd m10, m3, q1032 ; 7 _
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ punpcklwd m3, m10 ; 67
+ mova [rsp+0x40], m7
+ mova [rsp+0x50], m8
+ mova [rsp+0x60], m9
+%else
+ mova [stk+0x00], m12
+ mova [stk+0x10], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x40 ; 0 1
+ MC_4TAP_SCALED_H 0x50 ; 2 3
+ MC_4TAP_SCALED_H 0x60 ; 4 5
+ MC_4TAP_SCALED_H 0x70 ; 6 7
+ mova m4, [stk+0x40]
+ mova m5, [stk+0x50]
+ mova m6, [stk+0x60]
+ mova m7, [stk+0x70]
+ mov [stk+0xc0], r4
+ shufps m1, m4, m5, q1032 ; 1 2
+ shufps m2, m5, m6, q1032 ; 3 4
+ shufps m3, m6, m7, q1032 ; 5 6
+ pshufd m0, m7, q1032 ; 7 _
+ mova [stk+0xb0], m0
+ punpcklwd m0, m4, m1 ; 01
+ punpckhwd m4, m1 ; 12
+ punpcklwd m1, m5, m2 ; 23
+ punpckhwd m5, m2 ; 34
+ punpcklwd m2, m6, m3 ; 45
+ punpckhwd m6, m3 ; 56
+ punpcklwd m3, m7, [stk+0xb0] ; 67
+ mov myd, mym
+ mov r0, r0m
+ mova [stk+0x40], m0 ; 01
+ mova [stk+0x50], m1 ; 23
+ mova [stk+0x60], m2 ; 45
+ mova [stk+0x70], m3 ; 67
+ mova [stk+0x80], m4 ; 12
+ mova [stk+0x90], m5 ; 34
+ mova [stk+0xa0], m6 ; 56
+ %define m12 [stk+0x00]
+ %define m14 [stk+0x10]
+ %define m13 [stk+0x20]
+ %define m15 [stk+0x30]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%endif
+.w4_loop:
+ and myd, 0x3ff
+%if ARCH_X86_64
+ mov r11d, 64 << 24
+ mov r13d, myd
+ shr r13d, 6
+ lea r13d, [t1+r13]
+ cmovnz r11q, [base+subpel_filters+r13*8]
+ movq m9, r11q
+ punpcklbw m9, m9
+ psraw m9, 8
+ pshufd m7, m9, q0000
+ pshufd m8, m9, q1111
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pshufd m7, m9, q2222
+ pshufd m9, m9, q3333
+ pmaddwd m6, m2, m7
+ pmaddwd m8, m3, m9
+ %if isput
+ movd m9, [rsp+0x28]
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ paddd m4, m5
+ paddd m6, m8
+ paddd m4, m6
+ paddd m4, vrnd_mem
+%else
+ mov mym, myd
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ %if isput
+ movd m4, [esp+0x18]
+ %endif
+ paddd m0, m1
+ paddd m2, m3
+ paddd m0, vrnd_mem
+ paddd m0, m2
+ SWAP m4, m0
+ %define m9 m0
+%endif
+%if isput
+ pxor m5, m5
+ psrad m4, m9
+ packssdw m4, m4
+ pmaxsw m4, m5
+ pminsw m4, pxmaxm
+ movq [dstq], m4
+ add dstq, dsmp
+%else
+ psrad m4, 6
+ packssdw m4, m4
+ movq [tmpq], m4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ mova m8, [rsp+0x10]
+ movd m9, [rsp+0x20]
+ movu m4, [srcq]
+ movu m5, [srcq+r4]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova m0, [rsp+0x40]
+ mova [rsp+0x40], m1
+ mova m1, [rsp+0x50]
+ mova [rsp+0x50], m2
+ mova m2, [rsp+0x60]
+ mova [rsp+0x60], m3
+ pshufb m4, m12
+ pshufb m5, m14
+ pmaddwd m4, m13
+ pmaddwd m5, m15
+ phaddd m4, m5
+ paddd m4, m8
+ psrad m4, m9
+ packssdw m4, m4
+ punpcklwd m3, m10, m4
+ mova m10, m4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu m6, [srcq+ssq*1]
+ movu m7, [srcq+r6]
+ mova m0, [rsp+0x50]
+ mova m11, [rsp+0x60]
+ pshufb m4, m12
+ pshufb m6, m12
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddwd m4, m13
+ pmaddwd m6, m13
+ pmaddwd m5, m15
+ pmaddwd m7, m15
+ mova [rsp+0x40], m0
+ mova [rsp+0x50], m11
+ phaddd m4, m5
+ phaddd m6, m7
+ paddd m4, m8
+ paddd m6, m8
+ psrad m4, m9
+ psrad m6, m9
+ packssdw m4, m6
+ punpcklwd m9, m10, m4
+ mova [rsp+0x60], m9
+ pshufd m10, m4, q1032
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ punpcklwd m3, m4, m10
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+%else
+ SWAP m0, m4
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ jnz .w4_next_line
+ mova m0, [stk+0x40]
+ mova m1, [stk+0x50]
+ mova m2, [stk+0x60]
+ mova m3, [stk+0x70]
+ jmp .w4_loop
+.w4_next_line:
+ mov r5, [stk+0xc0]
+ movu m4, [srcq]
+ movu m5, [r5]
+ test myd, 0x400
+ jz .w4_skip_line
+ add [stk+0xc0], ssq
+ mova m0, [stk+0x80]
+ mova m3, [stk+0x50]
+ mova [stk+0x40], m0
+ mova [stk+0x80], m3
+ mova m1, [stk+0x90]
+ mova m6, [stk+0x60]
+ mova [stk+0x50], m1
+ mova [stk+0x90], m6
+ mova m2, [stk+0xa0]
+ mova m7, [stk+0x70]
+ mova [stk+0x60], m2
+ mova [stk+0xa0], m7
+ pshufb m4, m12
+ pshufb m5, m14
+ pmaddwd m4, m13
+ pmaddwd m5, m15
+ phaddd m4, m5
+ paddd m4, hrnd_mem
+ psrad m4, hsh_mem
+ packssdw m4, m4
+ punpcklwd m3, [stk+0xb0], m4
+ mova [stk+0xb0], m4
+ mova [stk+0x70], m3
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu m6, [srcq+ssq*1]
+ movu m7, [r5 +ssq*1]
+ lea r5, [r5 +ssq*2]
+ mov [stk+0xc0], r5
+ mova m0, [stk+0x50]
+ mova m1, [stk+0x60]
+ mova m2, [stk+0x70]
+ mova m3, [stk+0x90]
+ pshufb m4, m12
+ pshufb m6, m12
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddwd m4, m13
+ pmaddwd m6, m13
+ pmaddwd m5, m15
+ pmaddwd m7, m15
+ mova [stk+0x40], m0
+ mova [stk+0x50], m1
+ mova [stk+0x60], m2
+ mova [stk+0x80], m3
+ phaddd m4, m5
+ phaddd m6, m7
+ mova m5, [stk+0xa0]
+ mova m7, [stk+0xb0]
+ paddd m4, hrnd_mem
+ paddd m6, hrnd_mem
+ psrad m4, hsh_mem
+ psrad m6, hsh_mem
+ packssdw m4, m6
+ punpcklwd m7, m4
+ pshufd m6, m4, q1032
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m7
+ mova [stk+0xb0], m6
+ punpcklwd m3, m4, m6
+ mova [stk+0x70], m3
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+%endif
+INIT_XMM ssse3
+%if ARCH_X86_64
+ %define stk rsp+0x20
+%endif
+.w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
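+; widths of 8 and up are processed in columns of 8 pixels; [stk+0xf0] holds
+; the remaining column count and the saved dst/tmp pointer at [stk+0x130] is
+; advanced by 16 bytes per column in .hloop_prep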
+.w_start:
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ %define hround m11
+ shr t0d, 16
+ movd m15, t0d
+ %if isprep
+ mova m13, [base+pd_m524256]
+ %endif
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r4, [esp+0x1f0]
+ shr r4, 16
+ movd m15, r4
+ mov r0, r0m
+ mov myd, mym
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ mov r5, hm
+ mov [stk+0x0f4], myd
+ mov [stk+0x134], r5
+%endif
+ jmp .hloop
+.hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov myd, [stk+0x0f4]
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov mym, myd
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
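+; per-column setup: compute the integer source x offsets and the per-pixel
+; 8-tap horizontal filters for the next 8-pixel column, then filter the
+; first eight source rows before entering the vertical loop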
+.hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ mov myd, mym
+ mov dyd, dym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m5, m11, q0000
+ pshufd m7, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m4, m5, m0
+ pmaddwd m5, m5, m1
+ pmaddwd m6, m7, m2
+ pmaddwd m7, m7, m3
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m10
+ pmaddwd m7, [stk+0xa0], m10
+ pmaddwd m8, [stk+0xb0], m11
+ pmaddwd m9, [stk+0xc0], m11
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov myd, mym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.vloop:
+ mov r0, r0m
+ mov r5, [esp+0x1f4]
+ and myd, 0x3ff
+ mov mym, myd
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [stk+0x140], myd
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ jz .skip_line
+ mova m14, [base+unpckw]
+ movu m8, [srcq+r10*2]
+ movu m9, [srcq+r11*2]
+ movu m10, [srcq+r13*2]
+ movu m11, [srcq+ rX*2]
+ movu m4, [srcq+ r4*2]
+ movu m5, [srcq+ r6*2]
+ movu m6, [srcq+ r7*2]
+ movu m7, [srcq+ r9*2]
+ add srcq, ssq
+ mov myd, [stk+0x140]
+ mov dyd, dym
+ pshufd m15, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m15 ; 3a 2a
+ pshufb m3, m15 ; 3b 2b
+ pmaddwd m8, [stk+0x50]
+ pmaddwd m9, [stk+0x60]
+ pmaddwd m10, [stk+0x70]
+ pmaddwd m11, [stk+0x80]
+ pmaddwd m4, [stk+0x10]
+ pmaddwd m5, [stk+0x20]
+ pmaddwd m6, [stk+0x30]
+ pmaddwd m7, [stk+0x40]
+ phaddd m8, m9
+ phaddd m10, m11
+ mova m11, hround
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m8, m10
+ phaddd m4, m6
+ paddd m4, m11
+ paddd m8, m11
+ psrad m4, m12
+ psrad m8, m12
+ packssdw m4, m8
+ pshufb m5, [stk+0x90], m14 ; 4a 5a
+ pshufb m6, [stk+0xa0], m14 ; 4b 5b
+ pshufb m7, [stk+0xb0], m15 ; 7a 6a
+ pshufb m8, [stk+0xc0], m15 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ punpckhwd m5, m7 ; 56a
+ punpckhwd m6, m8 ; 56b
+ punpcklwd m7, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m8, m4 ; 78b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m8
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
+ MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
+ mov myd, [stk+0x140]
+ mov dyd, dym
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [stk+0x90] ; 23a
+ mova m3, [stk+0xa0] ; 23b
+ mova m5, [stk+0xb0] ; 45a
+ mova m6, [stk+0xc0] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m4
+%else
+ mov r0m, r0
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ mov mym, myd
+ jnz .next_line
+ mova m0, [stk+0x20]
+ mova m1, [stk+0x30]
+ mova m2, [stk+0x40]
+ mova m3, [stk+0x50]
+ jmp .vloop
+.next_line:
+ test myd, 0x400
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ jz .skip_line
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ mova m7, [base+unpckw]
+ pshufd m4, m7, q1032
+ pshufb m0, [stk+0x20], m7 ; 0a 1a
+ pshufb m1, [stk+0x30], m7 ; 0b 1b
+ pshufb m2, [stk+0x40], m4 ; 3a 2a
+ pshufb m3, [stk+0x50], m4 ; 3b 2b
+ pshufb m5, [stk+0x60], m7 ; 4a 5a
+ pshufb m6, [stk+0x70], m7 ; 4b 5b
+ pshufb m7, [stk+0x80], m4 ; 7a 6a
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ punpckhwd m5, m7 ; 56a
+ mova [stk+0x60], m5
+ pshufb m5, [stk+0x90], m4 ; 7b 6b
+ punpcklwd m7, [stk+0xe0] ; 78a
+ punpckhwd m6, m5 ; 56b
+ mova [stk+0x70], m6
+ movq m6, [stk+0xe8]
+ mova [stk+0x80], m7
+ punpcklwd m5, m6
+ mov myd, mym
+ mova [stk+0x90], m5
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ MC_8TAP_SCALED_H 0xa0, 0 ; 9
+ mova m7, [stk+0xe0]
+ mova m2, [stk+0x60] ; 23a
+ mova m3, [stk+0x70] ; 23b
+ mova m4, [stk+0x80] ; 45a
+ mova m5, [stk+0x90] ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova m0, [stk+0x40] ; 01a
+ mova m1, [stk+0x50] ; 01b
+ mov myd, mym
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+%endif
+ jmp .vloop
+INIT_XMM ssse3
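+; dy1: the vertical step is exactly one source row per output row, so the
+; 8-tap vertical filter is constant and is loaded once outside the loops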
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy1_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m13 [esp+0x20]
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+ mov r1, r1m
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ mov [stk+0x20], r3
+ mov r3, r3m
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6
+ REPX {pmaddwd x, m15}, m4, m5, m6
+ phaddd m0, m1
+ phaddd m2, m3
+ phaddd m4, m5
+ phaddd m6, m6
+ REPX {paddd x, m11}, m0, m2, m4, m6
+ REPX {psrad x, m12}, m0, m2, m4, m6
+ packssdw m0, m2 ; 0 1 2 3
+ packssdw m4, m6 ; 4 5 6
+ SWAP m1, m4
+ movq m10, r4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m1, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ REPX {pshufb x, m14}, m1, m7, m6
+ REPX {pmaddwd x, m15}, m1, m7, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ phaddd m1, m7
+ phaddd m6, m6
+ REPX {paddd x, m11}, m0, m2, m1, m6
+ REPX {psrad x, m12}, m0, m2, m1, m6
+ packssdw m0, m2
+ packssdw m1, m6
+ %define m8 m6
+ %define m9 m4
+ %define m10 m5
+ movd m10, r4
+ movd m9, [stk+0x20]
+ punpckldq m10, m9
+ %endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ %if ARCH_X86_32
+ mova [stk+0x50], m7
+ mova [stk+0x60], m8
+ mova [stk+0x70], m9
+ mova [stk+0x80], m10
+ %define m7 [stk+0x50]
+ %define m8 [stk+0x60]
+ %define m9 [stk+0x70]
+ %define m10 [stk+0x80]
+ %endif
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m4, m1, q2121 ; 5 6 5 6
+ punpcklwd m2, m1, m4 ; 45 56
+ %if ARCH_X86_32
+ mov r0, r0m
+ %endif
+.dy1_w2_loop:
+ movu m1, [srcq+ssq*0]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m3, m7
+ mova m3, m0
+ pmaddwd m0, m8
+ pshufb m1, m14
+ pshufb m6, m14
+ pmaddwd m1, m15
+ pmaddwd m6, m15
+ phaddd m1, m6
+ paddd m1, m11
+ psrad m1, m12
+ packssdw m1, m1
+ paddd m5, m0
+ mova m0, m2
+ pmaddwd m2, m9
+ paddd m5, m2
+ palignr m2, m1, m4, 12
+ punpcklwd m2, m1 ; 67 78
+ pmaddwd m4, m2, m10
+ paddd m5, m13
+ paddd m5, m4
+ pxor m6, m6
+ mova m4, m1
+ pshufd m1, m12, q1032
+ psrad m5, m1
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pminsw m5, pxmaxm
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q1032
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy1_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %define vrnd_mem [rsp+0x30]
+ %define stk rsp+0x40
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %define stk rsp+0x30
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m9 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd rX, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ SWAP m4, m7
+ %if isprep
+ mov r3, r3m
+ %endif
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x40], m13
+ mova [stk+0x50], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ movu m1, [srcq+r4 ]
+ movu m3, [srcq+r6 ]
+ movu m2, [srcq+r11 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m13}, m7, m9, m8, m10
+ REPX {pshufb x, m14}, m1, m3, m2, m4
+ REPX {pmaddwd x, m15}, m1, m3, m2, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1
+ packssdw m8, m10 ; 2 3
+ movu m0, [srcq+r4 ]
+ movu m9, [srcq+r6 ]
+ movu m10, [srcq+r11 ]
+ add srcq, ss3q
+ REPX {pshufb x, m12}, m1, m2, m3
+ REPX {pmaddwd x, m13}, m1, m2, m3
+ REPX {pshufb x, m14}, m0, m9, m10
+ REPX {pmaddwd x, m15}, m0, m9, m10
+ phaddd m1, m0
+ phaddd m2, m9
+ phaddd m3, m10
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ REPX {paddd x, m5}, m1, m2, m3
+ REPX {psrad x, xm6}, m1, m2, m3
+ packssdw m1, m2 ; 4 5
+ packssdw m3, m3 ; 6 6
+ SWAP m9, m1
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ movq m10, r13
+ mova [stk+0x00], m1
+ mova [stk+0x10], m8
+ mova [stk+0x20], m2
+ mova [stk+0x30], m9
+ mova [stk+0x40], m3
+ %define hrnd_mem [rsp+0x10]
+ %define hsh_mem [rsp+0x20]
+ %define vsh_mem [rsp+0x28]
+ %if isput
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%else
+ mova [stk+0x20], m12
+ mova [stk+0x30], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x60 ; 0 1
+ MC_4TAP_SCALED_H 0x70 ; 2 3
+ MC_4TAP_SCALED_H 0x80 ; 4 5
+ movu m7, [srcq]
+ movu m2, [r4]
+ add srcq, ssq
+ add r4, ssq
+ mov [stk+0xb0], r4
+ pshufb m7, m12
+ pshufb m2, m14
+ pmaddwd m7, m13
+ pmaddwd m2, m15
+ phaddd m7, m2
+ paddd m7, [esp+0x00]
+ psrad m7, [esp+0x10]
+ packssdw m7, m7 ; 6 6
+ mova m4, [stk+0x60]
+ mova m5, [stk+0x70]
+ mova m6, [stk+0x80]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ shufps m1, m4, m5, q1032 ; 1 2
+ shufps m2, m5, m6, q1032 ; 3 4
+ shufps m3, m6, m7, q1032 ; 5 6
+ mova [stk+0xa0], m7
+ punpcklwd m0, m4, m1 ; 01
+ punpckhwd m4, m1 ; 12
+ punpcklwd m1, m5, m2 ; 23
+ punpckhwd m5, m2 ; 34
+ punpcklwd m2, m6, m3 ; 45
+ punpckhwd m6, m3 ; 56
+ movd m7, r4
+ movd m3, r5
+ mov r0, r0m
+ %if isput
+ mov r1, r1m
+ %endif
+ mov r4, [stk+0xb0]
+ mova [stk+0xc0], m4 ; 12
+ mova [stk+0x60], m1 ; 23
+ mova [stk+0x70], m2 ; 45
+ mova [stk+0x80], m5 ; 34
+ mova [stk+0x90], m6 ; 56
+ %define m12 [stk+0x20]
+ %define m14 [stk+0x30]
+ %define m13 [stk+0x40]
+ %define m15 [stk+0x50]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %define vsh_mem [esp+0x18]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ %define m10 m7
+ punpckldq m10, m3
+%endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m3, m10, q0000
+ pshufd m4, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_32
+ %xdefine m8 m3
+ %xdefine m9 m6
+ %xdefine m11 m5
+ %xdefine m6 m4
+ mova [stk+0x100], m3
+ mova [stk+0x110], m4
+ mova [stk+0x120], m5
+ mova [stk+0x130], m10
+ %define m3 [stk+0x100]
+ %define m4 [stk+0x110]
+ %define m5 [stk+0x120]
+ %define m10 [stk+0x130]
+ mova m7, [stk+0xc0]
+ mova m8, [stk+0x80]
+%endif
+.dy1_w4_loop:
+ movu m11, [srcq+ssq*0]
+ movu m6, [srcq+ssq*1]
+ pmaddwd m0, m3
+ pmaddwd m7, m3
+ pmaddwd m1, m4
+ pmaddwd m8, m4
+ pmaddwd m2, m5
+ pmaddwd m9, m5
+ paddd m1, m0
+ paddd m8, m7
+%if ARCH_X86_64
+ movu m0, [srcq+r4]
+ movu m7, [srcq+r6]
+%else
+ movu m0, [r4+ssq*0]
+ movu m7, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+%endif
+ lea srcq, [srcq+ssq*2]
+ paddd m1, m2
+ paddd m8, m9
+ pshufb m11, m12
+ pshufb m6, m12
+ pmaddwd m11, m13
+ pmaddwd m6, m13
+ pshufb m0, m14
+ pshufb m7, m14
+ pmaddwd m0, m15
+ pmaddwd m7, m15
+ phaddd m11, m0
+ phaddd m6, m7
+ paddd m11, hrnd_mem
+ paddd m6, hrnd_mem
+ psrad m11, hsh_mem
+ psrad m6, hsh_mem
+ packssdw m11, m6 ; 7 8
+%if ARCH_X86_64
+ shufps m9, [stk+0x40], m11, q1032 ; 6 7
+ mova m0, [stk+0x00]
+ mova [stk+0x40], m11
+%else
+ shufps m9, [stk+0xa0], m11, q1032 ; 6 7
+ mova m0, [stk+0x60]
+ mova [stk+0xa0], m11
+%endif
+ punpcklwd m2, m9, m11 ; 67
+ punpckhwd m9, m11 ; 78
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m9, m10
+%if isput
+ movd m11, vsh_mem
+%endif
+ paddd m1, vrnd_mem
+ paddd m8, vrnd_mem
+ paddd m1, m6
+ paddd m8, m7
+%if ARCH_X86_64
+ mova m7, [stk+0x10]
+%else
+ mova m7, [stk+0x80]
+%endif
+%if isput
+ psrad m1, m11
+ psrad m8, m11
+%else
+ psrad m1, 6
+ psrad m8, 6
+%endif
+ packssdw m1, m8
+%if ARCH_X86_64
+ mova m8, [stk+0x30]
+%else
+ mova m8, [stk+0x90]
+%endif
+%if isput
+ pxor m6, m6
+ pmaxsw m1, m6
+ pminsw m1, pxmaxm
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m1
+ add tmpq, 16
+%endif
+%if ARCH_X86_64
+ mova m1, [stk+0x20]
+ mova [stk+0x10], m8
+ mova [stk+0x00], m1
+ mova [stk+0x20], m2
+ mova [stk+0x30], m9
+%else
+ mova m1, [stk+0x70]
+ mova [stk+0x80], m8
+ mova [stk+0x60], m1
+ mova [stk+0x70], m2
+ mova [stk+0x90], m9
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET ; why not jz .ret?
+INIT_XMM ssse3
+.dy1_w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %define hround m11
+ %if isput
+ mova [rsp+0x30], m13
+ %else
+ mova m13, [base+pd_m524256]
+ %endif
+ shr t0d, 16
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ SWAP m3, m5
+%endif
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+%if ARCH_X86_64
+ mova [stk+0x140], m0
+ mova [stk+0x150], m1
+ mova [stk+0x160], m2
+ mova [stk+0x170], m3
+ %if UNIX64
+ mov hm, hd
+ %endif
+%else
+ mova [stk+0x180], m0
+ mova [stk+0x190], m1
+ mova [stk+0x1a0], m2
+ mova [stk+0x1b0], m3
+ SWAP m5, m3
+ mov r5, hm
+ mov [stk+0x134], r5
+%endif
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy1_hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova m10, [stk+0x140]
+ mova m11, [stk+0x150]
+ mova m14, [stk+0x160]
+ mova m15, [stk+0x170]
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+ %define shift [rsp+0x20]
+ %if isput
+ %define vround [rsp+0x30]
+ %else
+ %define vround [base+pd_m524256]
+ %endif
+.dy1_vloop:
+ pmaddwd m4, m0, m10
+ pmaddwd m5, m1, m10
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m11
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m14
+ pmaddwd m7, [stk+0xa0], m14
+ pmaddwd m8, [stk+0xb0], m15
+ pmaddwd m9, [stk+0xc0], m15
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov r0, r0m
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.dy1_vloop:
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+%if ARCH_X86_64
+ movu m8, [srcq+r10*2]
+ movu m9, [srcq+r11*2]
+ movu m12, [srcq+r13*2]
+ movu m13, [srcq+ rX*2]
+ movu m4, [srcq+ r4*2]
+ movu m5, [srcq+ r6*2]
+ movu m6, [srcq+ r7*2]
+ movu m7, [srcq+ r9*2]
+ add srcq, ssq
+ pmaddwd m8, [stk+0x50]
+ pmaddwd m9, [stk+0x60]
+ pmaddwd m12, [stk+0x70]
+ pmaddwd m13, [stk+0x80]
+ pmaddwd m4, [stk+0x10]
+ pmaddwd m5, [stk+0x20]
+ pmaddwd m6, [stk+0x30]
+ pmaddwd m7, [stk+0x40]
+ phaddd m8, m9
+ phaddd m12, m13
+ mova m9, [base+unpckw]
+ mova m13, hround
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m8, m12
+ phaddd m4, m6
+ pshufd m5, m9, q1032
+ pshufb m0, m9 ; 0a 1a
+ pshufb m1, m9 ; 0b 1b
+ pshufb m2, m5 ; 3a 2a
+ pshufb m3, m5 ; 3b 2b
+ mova m12, shift
+ paddd m4, m13
+ paddd m8, m13
+ psrad m4, m12
+ psrad m8, m12
+ packssdw m4, m8
+ pshufb m6, [stk+0x90], m9 ; 4a 5a
+ pshufb m7, [stk+0xa0], m9 ; 4b 5b
+ pshufb m8, [stk+0xb0], m5 ; 7a 6a
+ pshufb m13, [stk+0xc0], m5 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m6 ; 34a
+ punpcklwd m3, m7 ; 34b
+ punpckhwd m6, m8 ; 56a
+ punpckhwd m7, m13 ; 56b
+ punpcklwd m8, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m13, m4 ; 78b
+ mova [stk+0x90], m6
+ mova [stk+0xa0], m7
+ mova [stk+0xb0], m8
+ mova [stk+0xc0], m13
+ mova m13, vround
+%else
+ mov r0m, r0
+ mov r3, r3m
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ mova m7, [base+unpckw]
+ pshufd m4, m7, q1032
+ pshufb m0, [stk+0x20], m7 ; 0a 1a
+ pshufb m1, [stk+0x30], m7 ; 0b 1b
+ pshufb m2, [stk+0x40], m4 ; 3a 2a
+ pshufb m3, [stk+0x50], m4 ; 3b 2b
+ pshufb m5, [stk+0x60], m7 ; 4a 5a
+ pshufb m6, [stk+0x70], m7 ; 4b 5b
+ pshufb m7, [stk+0x80], m4 ; 7a 6a
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ punpckhwd m5, m7 ; 56a
+ mova [stk+0x60], m5
+ pshufb m5, [stk+0x90], m4 ; 7b 6b
+ punpcklwd m7, [stk+0xe0] ; 78a
+ mova m4, [stk+0x180]
+ punpckhwd m6, m5 ; 56b
+ mova [stk+0x70], m6
+ movq m6, [stk+0xe8]
+ mova [stk+0x80], m7
+ mova m7, [stk+0x1b0]
+ punpcklwd m5, m6
+ mova m6, [stk+0x1a0]
+ mova [stk+0x90], m5
+ mova m5, [stk+0x190]
+ mov r0, r0m
+%endif
+ jmp .dy1_vloop
+INIT_XMM ssse3
+%if ARCH_X86_64
+ %define stk rsp+0x20
+%endif
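+; dy2: the vertical step is exactly two source rows per output row; the
+; vertical filter is again constant and loaded only once, and each output
+; row consumes two freshly filtered source rows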
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy2_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m13
+ %define vrnd_mem [rsp+0x10]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ %define vrnd_mem [esp+0x20]
+ mov r1, r1m
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*2]
+ movu m2, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*1]
+ movu m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ mov [stk+0x20], r3
+ mov r3, r3m
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2
+ REPX {pmaddwd x, m15}, m0, m1, m2
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6
+ REPX {pmaddwd x, m15}, m4, m5, m6
+ phaddd m0, m1
+ phaddd m1, m2
+ phaddd m4, m5
+ phaddd m5, m6
+ REPX {paddd x, m11}, m0, m1, m4, m5
+ REPX {psrad x, m12}, m0, m1, m4, m5
+ packssdw m0, m1 ; 0 2 2 4
+ packssdw m4, m5 ; 1 3 3 5
+ SWAP m2, m4
+ movq m10, r4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m1, m2
+ movu m2, [srcq+ssq*1]
+ movu m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ REPX {pshufb x, m14}, m2, m7, m6
+ REPX {pmaddwd x, m15}, m2, m7, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ phaddd m2, m7
+ phaddd m7, m6
+ REPX {paddd x, m11}, m0, m1, m2, m7
+ REPX {psrad x, m12}, m0, m1, m2, m7
+ packssdw m0, m1
+ packssdw m2, m7
+ %define m8 m6
+ %define m9 m4
+ %define m10 m5
+ movd m10, r4
+ movd m9, [stk+0x20]
+ punpckldq m10, m9
+ %endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ %if ARCH_X86_32
+ mova [stk+0x50], m7
+ mova [stk+0x60], m8
+ mova [stk+0x70], m9
+ mova [stk+0x80], m10
+ %xdefine m13 m7
+ %define m7 [stk+0x50]
+ %define m8 [stk+0x60]
+ %define m9 [stk+0x70]
+ %define m10 [stk+0x80]
+ %endif
+ punpcklwd m1, m0, m2 ; 01 23
+ punpckhwd m3, m0, m2 ; 23 45
+ %if ARCH_X86_32
+ mov r4, r0m
+ %define dstq r4
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ %endif
+.dy2_w2_loop:
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m13, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m3, m8
+ REPX {pshufb x, m14}, m4, m5, m6, m13
+ REPX {pmaddwd x, m15}, m4, m5, m6, m13
+ phaddd m4, m5
+ phaddd m6, m13
+ pmaddwd m5, m1, m7
+ paddd m4, m11
+ paddd m6, m11
+ psrad m4, m12
+ psrad m6, m12
+ packssdw m4, m6 ; 6 7 8 9
+ paddd m5, m3
+ pshufd m3, m4, q2200
+ pshufd m4, m4, q3311
+ palignr m3, m0, 12 ; 4 6 6 8
+ palignr m4, m2, 12 ; 5 7 7 9
+ mova m0, m3
+ mova m2, m4
+ punpcklwd m1, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m6, m1, m9
+ pmaddwd m4, m3, m10
+ paddd m5, vrnd_mem
+ paddd m6, m4
+ paddd m5, m6
+ pshufd m4, m12, q1032
+ pxor m6, m6
+ psrad m5, m4
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pminsw m5, pxmaxm
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q1032
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy2_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %define vrnd_mem [rsp+0x30]
+ %define stk rsp+0x40
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %define stk rsp+0x30
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m9 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r1, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r3, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ SWAP m4, m7
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x40], m13
+ mova [stk+0x50], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m1, [srcq+ssq*0]
+ movu m8, [srcq+ssq*2]
+ movu m9, [srcq+ssq*1]
+ movu m10, [srcq+ss3q ]
+ movu m7, [srcq+r4 ]
+ movu m2, [srcq+r11 ]
+ movu m3, [srcq+r6 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m1, m9, m8, m10
+ REPX {pmaddwd x, m13}, m1, m9, m8, m10
+ REPX {pshufb x, m14}, m7, m3, m2, m4
+ REPX {pmaddwd x, m15}, m7, m3, m2, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m1, m7
+ phaddd m8, m2
+ phaddd m9, m3
+ phaddd m10, m4
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ REPX {paddd x, m5}, m1, m9, m8, m10
+ REPX {psrad x, xm6}, m1, m9, m8, m10
+ packssdw m1, m8 ; 0 2
+ packssdw m9, m10 ; 1 3
+ movu m0, [srcq+r4 ]
+ movu m8, [srcq+r6 ]
+ lea srcq, [srcq+ssq*2]
+ REPX {pshufb x, m12}, m2, m3
+ REPX {pmaddwd x, m13}, m2, m3
+ REPX {pshufb x, m14}, m0, m8
+ REPX {pmaddwd x, m15}, m0, m8
+ phaddd m2, m0
+ phaddd m3, m8
+ shr myd, 6
+ mov r9d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r9q, [base+subpel_filters+myq*8]
+ REPX {paddd x, m5}, m2, m3
+ REPX {psrad x, xm6}, m2, m3
+ packssdw m2, m3 ; 4 5
+ pshufd m3, m2, q1032 ; 5 _
+ punpcklwd m0, m1, m9 ; 01
+ punpckhwd m1, m9 ; 23
+ punpcklwd m2, m3 ; 45
+ movq m10, r9
+ %define hrnd_mem [rsp+0x10]
+ %define hsh_mem [rsp+0x20]
+ %define vsh_mem [rsp+0x28]
+ %if isput
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%else
+ mova [stk+0x20], m12
+ mova [stk+0x30], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x60 ; 0 1
+ MC_4TAP_SCALED_H 0x70 ; 2 3
+ MC_4TAP_SCALED_H 0x80 ; 4 5
+ mov [stk+0xe0], r4
+ mova m3, [base+spel_s_shuf8]
+ mova m0, [stk+0x60]
+ mova m1, [stk+0x70]
+ mova m2, [stk+0x80]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ pshufb m0, m3 ; 01
+ pshufb m1, m3 ; 23
+ pshufb m2, m3 ; 45
+ movd m7, r4
+ movd m4, r5
+ mov r5, r0m
+ %if isput
+ mov r1, r1m
+ %endif
+ mov r4, [stk+0xe0]
+ %define dstq r5
+ %define tmpq r5
+ %define m12 [stk+0x20]
+ %define m14 [stk+0x30]
+ %define m13 [stk+0x40]
+ %define m15 [stk+0x50]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %define vsh_mem [esp+0x18]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ %define m10 m7
+ punpckldq m10, m4
+%endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m3, m10, q0000
+ pshufd m4, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_32
+ %xdefine m8 m3
+ %xdefine m9 m6
+ %xdefine m11 m5
+ %xdefine m6 m4
+ mova [stk+0x100], m3
+ mova [stk+0x110], m4
+ mova [stk+0x120], m5
+ mova [stk+0x130], m10
+ %define m3 [stk+0x100]
+ %define m4 [stk+0x110]
+ %define m5 [stk+0x120]
+ %define m10 [stk+0x130]
+%endif
+.dy2_w4_loop:
+ pmaddwd m8, m0, m3
+ pmaddwd m9, m1, m3
+ mova m0, m2
+ pmaddwd m1, m4
+ pmaddwd m11, m2, m4
+ paddd m8, vrnd_mem
+ paddd m9, vrnd_mem
+ pmaddwd m2, m5
+ paddd m8, m1
+ paddd m9, m11
+ paddd m8, m2
+ movu m6, [srcq+ssq*0]
+ movu m1, [srcq+ssq*2]
+%if ARCH_X86_64
+ movu m11, [srcq+r4 ]
+ movu m2, [srcq+r11]
+%else
+ movu m11, [r4+ssq*0]
+ movu m2, [r4+ssq*2]
+%endif
+ pshufb m6, m12
+ pshufb m1, m12
+ pmaddwd m6, m13
+ pmaddwd m1, m13
+ pshufb m11, m14
+ pshufb m2, m14
+ pmaddwd m11, m15
+ pmaddwd m2, m15
+ phaddd m6, m11
+ phaddd m1, m2
+ paddd m6, hrnd_mem
+ paddd m1, hrnd_mem
+ psrad m6, hsh_mem
+ psrad m1, hsh_mem
+ movu m7, [srcq+ssq*1]
+ movu m11, [srcq+ss3q ]
+ packssdw m6, m1 ; 6 8
+%if ARCH_X86_64
+ movu m2, [srcq+r6 ]
+ movu m1, [srcq+r13]
+%else
+ movu m2, [r4+ssq*1]
+ movu m1, [r4+ss3q ]
+%endif
+ pshufb m7, m12
+ pshufb m11, m12
+ pmaddwd m7, m13
+ pmaddwd m11, m13
+ pshufb m2, m14
+ pshufb m1, m14
+ pmaddwd m2, m15
+ pmaddwd m1, m15
+ phaddd m7, m2
+ phaddd m11, m1
+ paddd m7, hrnd_mem
+ paddd m11, hrnd_mem
+ psrad m7, hsh_mem
+ psrad m11, hsh_mem
+ packssdw m7, m11 ; 7 9
+%if ARCH_X86_32
+ lea r4, [r4+ssq*4]
+%endif
+ lea srcq, [srcq+ssq*4]
+ punpcklwd m1, m6, m7 ; 67
+ punpckhwd m6, m7 ; 89
+ mova m2, m6
+ pmaddwd m11, m1, m5
+ pmaddwd m7, m1, m10
+ pmaddwd m6, m10
+ paddd m9, m11
+%if isput
+ movd m11, vsh_mem
+%endif
+ paddd m8, m7
+ paddd m9, m6
+%if isput
+ psrad m8, m11
+ psrad m9, m11
+ packssdw m8, m9
+ pxor m7, m7
+ pmaxsw m8, m7
+ pminsw m8, pxmaxm
+ movq [dstq+dsq*0], m8
+ movhps [dstq+dsq*1], m8
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m8, 6
+ psrad m9, 6
+ packssdw m8, m9
+ mova [tmpq], m8
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET ; why not jz .ret?
+INIT_XMM ssse3
+.dy2_w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %define hround m11
+ %if isput
+ mova [rsp+0x30], m13
+ %else
+ mova m13, [base+pd_m524256]
+ %endif
+ shr t0d, 16
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isput
+ %define dstq r0
+ %else
+ %define tmpq r0
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ SWAP m3, m5
+%endif
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+%if ARCH_X86_64
+ mova [stk+0x140], m0
+ mova [stk+0x150], m1
+ mova [stk+0x160], m2
+ mova [stk+0x170], m3
+ %if UNIX64
+ mov hm, hd
+ %endif
+%else
+ mova [stk+0x180], m0
+ mova [stk+0x190], m1
+ mova [stk+0x1a0], m2
+ mova [stk+0x1b0], m3
+ SWAP m5, m3
+ mov r5, hm
+ mov [stk+0x134], r5
+%endif
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy2_hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova m10, [stk+0x140]
+ mova m11, [stk+0x150]
+ mova m14, [stk+0x160]
+ mova m15, [stk+0x170]
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+ %define shift [rsp+0x20]
+ %if isput
+ %define vround [rsp+0x30]
+ %else
+ %define vround [base+pd_m524256]
+ %endif
+.dy2_vloop:
+ pmaddwd m4, m0, m10
+ pmaddwd m5, m1, m10
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m11
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m14
+ pmaddwd m7, [stk+0xa0], m14
+ pmaddwd m8, [stk+0xb0], m15
+ pmaddwd m9, [stk+0xc0], m15
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov r0, r0m
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.dy2_vloop:
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+%if ARCH_X86_64
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
+ mova [stk+0xd0], m4
+ MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
+ mova m4, [stk+0xd0]
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [stk+0x90] ; 23a
+ mova m3, [stk+0xa0] ; 23b
+ mova m5, [stk+0xb0] ; 45a
+ mova m6, [stk+0xc0] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m4
+%else
+ mov r0m, r0
+ mov r3, r3m
+ MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
+ MC_8TAP_SCALED_H 0xa0, 0 ; 9
+ mova m7, [stk+0xe0]
+ mova m2, [stk+0x60] ; 23a
+ mova m3, [stk+0x70] ; 23b
+ mova m4, [stk+0x80] ; 45a
+ mova m5, [stk+0x90] ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova m0, [stk+0x40] ; 01a
+ mova m1, [stk+0x50] ; 01b
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mov r0, r0m
+%endif
+ jmp .dy2_vloop
+INIT_XMM ssse3
+.ret:
+ MC_8TAP_SCALED_RET 0
+%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
+ %define r0m [rstk+stack_offset+ 4]
+ %define r1m [rstk+stack_offset+ 8]
+ %define r2m [rstk+stack_offset+12]
+ %define r3m [rstk+stack_offset+16]
+%endif
+%undef isput
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_16bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 8
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp, SHARP, SHARP
+FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 7
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp, SHARP, SHARP
+FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 6
+%else
+DECLARE_REG_TMP 2
+%endif
+
+%if ARCH_X86_64
+; warp8x8t spills one less xmm register than warp8x8 on WIN64; compensate for
+; that by allocating 16 bytes more stack space so that stack offsets match up.
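+; (warp_affine_8x8_16bpc below asserts that its stack_size_padded matches this.)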
+%if WIN64 && STACK_ALIGNMENT == 16
+%assign stksz 16*14
+%else
+%assign stksz 16*13
+%endif
+cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
+ mx, tmp, alpha, beta, \
+ filter, my, gamma, cnt
+%assign stack_size_padded_8x8t stack_size_padded
+%else
+cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
+ filter, mx, my
+%define m8 [esp+16*13]
+%define m9 [esp+16*14]
+%define cntd dword [esp+4*63]
+%define dstq tmpq
+%define dsq 0
+%if STACK_ALIGNMENT < 16
+%define dstm [esp+4*65]
+%define dsm [esp+4*66]
+%else
+%define dstm r0m
+%define dsm r1m
+%endif
+%endif
+%define base filterq-$$
+ mov t0d, r7m
+ LEA filterq, $$
+ shr t0d, 11
+%if ARCH_X86_64
+ movddup m8, [base+warp8x8t_rnd]
+%else
+ movddup m1, [base+warp8x8t_rnd]
+ mov r1, r1m
+ add r1, r1
+ mova m8, m1
+ mov r1m, r1 ; ds *= 2
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
+ jmp .start
+.loop:
+%if ARCH_X86_64
+ lea dstq, [dstq+dsq*4]
+%else
+ add dstq, dsm
+ mov dstm, dstq
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
+.start:
+%if ARCH_X86_32
+ mov dstq, dstm
+%endif
+ paddd m1, m8
+ paddd m2, m8
+ psrad m1, 15
+ psrad m2, 15
+ packssdw m1, m2
+ mova [dstq+dsq*0], m1
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
+%if ARCH_X86_32
+ mov dstq, dstm
+ add dstq, dsm
+%endif
+ paddd m1, m8
+ paddd m2, m8
+ psrad m1, 15
+ psrad m2, 15
+ packssdw m1, m2
+ mova [dstq+dsq*2], m1
+ dec cntd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
+ mx, tmp, alpha, beta, \
+ filter, my, gamma, cnt
+ASSERT stack_size_padded == stack_size_padded_8x8t
+%else
+cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
+ filter, mx, my
+%endif
+ mov t0d, r7m
+ LEA filterq, $$
+ shr t0d, 11
+%if ARCH_X86_64
+ movddup m8, [base+warp8x8_rnd2+t0*8]
+ movd m9, r7m ; pixel_max
+ pshufb m9, [base+pw_256]
+%else
+ movddup m1, [base+warp8x8_rnd2+t0*8]
+ movd m2, r7m ; pixel_max
+ pshufb m2, [base+pw_256]
+ mova m8, m1
+ mova m9, m2
+%endif
+ call .main
+ jmp .start
+.loop:
+%if ARCH_X86_64
+ lea dstq, [dstq+dsq*2]
+%else
+ add dstq, dsm
+ mov dstm, dstq
+%endif
+ call .main2
+.start:
+%if ARCH_X86_32
+ mov dstq, dstm
+%endif
+ psrad m1, 16
+ psrad m2, 16
+ packssdw m1, m2
+ pmaxsw m1, m6
+ pmulhrsw m1, m8
+ pminsw m1, m9
+ mova [dstq+dsq*0], m1
+ call .main3
+%if ARCH_X86_32
+ mov dstq, dstm
+ add dstq, dsm
+%endif
+ psrad m1, 16
+ psrad m2, 16
+ packssdw m1, m2
+ pmaxsw m1, m6
+ pmulhrsw m1, m8
+ pminsw m1, m9
+ mova [dstq+dsq*1], m1
+ dec cntd
+ jg .loop
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov deltaq, r5m
+ mov mxd, r6m
+%endif
+ movd m0, [base+warp8x8_shift+t0*4]
+ movddup m7, [base+warp8x8_rnd1+t0*8]
+ add filterq, mc_warp_filter-$$
+%if ARCH_X86_64
+ movsx alphad, word [deltaq+2*0]
+ movsx betad, word [deltaq+2*1]
+ movsx gammad, word [deltaq+2*2]
+ movsx deltad, word [deltaq+2*3]
+ lea tmpq, [ssq*3]
+ add mxd, 512+(64<<10)
+ sub srcq, tmpq ; src -= ss*3
+ imul tmpd, alphad, -7
+ mov myd, r7m
+ add betad, tmpd ; beta -= alpha*7
+ imul tmpd, gammad, -7
+ add myd, 512+(64<<10)
+ mov cntd, 4
+ add deltad, tmpd ; delta -= gamma*7
+%else
+%if STACK_ALIGNMENT < 16
+ %assign stack_offset stack_offset - gprsize
+%endif
+ mov r3d, r5m ; abcd
+%if STACK_ALIGNMENT < 16
+ mov r0, r1m ; dst
+ mov r1, r2m ; ds
+ mov [esp+gprsize+4*65], r0
+ mov [esp+gprsize+4*66], r1
+%endif
+ movsx alphad, word [r3+2*0]
+ movsx r2d, word [r3+2*1]
+ movsx gammad, word [r3+2*2]
+ movsx r3d, word [r3+2*3]
+ imul r5d, alphad, -7
+ add r2d, r5d ; beta -= alpha*7
+ imul r5d, gammad, -7
+ mov [esp+gprsize+4*60], r2d
+ add r3d, r5d ; delta -= gamma*7
+ mov [esp+gprsize+4*61], r3d
+ mov r3d, r4m ; ss
+ mov srcq, r3m
+ mov mxd, r6m
+ mov myd, r7m
+ mov dword [esp+gprsize+4*63], 4 ; cnt
+ mov [esp+gprsize+4*62], r3
+ lea r3, [r3*3]
+ add mxd, 512+(64<<10)
+ add myd, 512+(64<<10)
+ sub srcq, r3 ; src -= ss*3
+%if STACK_ALIGNMENT < 16
+ %assign stack_offset stack_offset + gprsize
+%endif
+%endif
+ mova [rsp+gprsize], m0
+ pxor m6, m6
+ call .h
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 01
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 1], m1
+ mova [rsp+gprsize+16* 4], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 12
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 7], m1
+ mova [rsp+gprsize+16*10], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 23
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 2], m1
+ mova [rsp+gprsize+16* 5], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 34
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 8], m1
+ mova [rsp+gprsize+16*11], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 45
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 3], m1
+ mova [rsp+gprsize+16* 6], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 56
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 9], m1
+ mova [rsp+gprsize+16*12], m5
+ mova m5, m0
+.main2:
+ call .h
+%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m4, [filterq+myq*8] ; a
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m2, [filterq+tmpq*8] ; b
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m3, [filterq+myq*8] ; c
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m1, [filterq+tmpq*8] ; d
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ punpcklwd m4, m2
+ punpcklwd m3, m1
+ punpckldq m2, m4, m3
+ punpckhdq m4, m3
+ punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
+ pmaddwd m1, [rsp+gprsize+16*%1]
+ punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
+ mova m2, [rsp+gprsize+16*%2]
+ pmaddwd m3, m2
+ mova [rsp+gprsize+16*%1], m2
+ paddd m1, m3
+ punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
+ mova m2, [rsp+gprsize+16*%3]
+ pmaddwd m3, m2
+ mova [rsp+gprsize+16*%2], m2
+ paddd m1, m3
+ punpcklwd m3, m5, m0 ; 67
+ punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
+ pmaddwd m2, m3
+ mova [rsp+gprsize+16*%3], m3
+ paddd m1, m2
+ movq m4, [filterq+myq*8] ; e
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8] ; f
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m2, [filterq+myq*8] ; g
+%if ARCH_X86_64
+ lea myd, [tmpq+deltaq] ; my += delta
+%else
+ mov myd, [esp+gprsize+4*61]
+ add myd, tmpd
+%endif
+ shr tmpd, 10
+ punpcklwd m4, m3
+ movq m3, [filterq+tmpq*8] ; h
+ punpcklwd m2, m3
+ punpckldq m3, m4, m2
+ punpckhdq m4, m2
+ punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
+ pmaddwd m2, [rsp+gprsize+16*%4]
+ punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
+ mova m3, [rsp+gprsize+16*%5]
+ pmaddwd m6, m3
+ mova [rsp+gprsize+16*%4], m3
+ pxor m3, m3
+ paddd m2, m6
+ punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
+ mova m6, [rsp+gprsize+16*%6]
+ pmaddwd m3, m6
+ mova [rsp+gprsize+16*%5], m6
+ punpckhwd m5, m0
+ pxor m6, m6
+ paddd m2, m3
+ punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
+ pmaddwd m3, m5
+ mova [rsp+gprsize+16*%6], m5
+ mova m5, m0
+ paddd m2, m3
+%endmacro
+ WARP_V 1, 2, 3, 4, 5, 6
+ ret
+.main3:
+ call .h
+ WARP_V 7, 8, 9, 10, 11, 12
+ ret
+ALIGN function_align
+.h:
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ punpcklbw m0, m6, m3
+ movu m3, [srcq-6]
+ pmaddwd m0, m3 ; 0
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m2, m6, m3
+ movu m3, [srcq-4]
+ pmaddwd m2, m3 ; 1
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m0, m2 ; 0 1
+ punpcklbw m2, m6, m3
+ movu m3, [srcq-2]
+ pmaddwd m2, m3 ; 2
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m1, m6, m3
+ movu m3, [srcq+0]
+ pmaddwd m1, m3 ; 3
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m2, m1 ; 2 3
+ punpcklbw m1, m6, m3
+ movu m3, [srcq+2]
+ pmaddwd m1, m3 ; 4
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ phaddd m0, m2 ; 0 1 2 3
+ punpcklbw m2, m6, m3
+ movu m3, [srcq+4]
+ pmaddwd m2, m3 ; 5
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m1, m2 ; 4 5
+ punpcklbw m2, m6, m3
+ movu m3, [srcq+6]
+ pmaddwd m2, m3 ; 6
+%if ARCH_X86_64
+ lea mxd, [tmpq+betaq] ; mx += beta
+%else
+ mov mxd, [esp+gprsize*2+4*60]
+ add mxd, tmpd
+%endif
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m4, m6, m3
+ movu m3, [srcq+8]
+%if ARCH_X86_64
+ add srcq, ssq
+%else
+ add srcq, [esp+gprsize*2+4*62]
+%endif
+ pmaddwd m3, m4 ; 7
+ phaddd m2, m3 ; 6 7
+ phaddd m1, m2 ; 4 5 6 7
+ paddd m0, m7
+ paddd m1, m7
+ psrad m0, [rsp+gprsize*2]
+ psrad m1, [rsp+gprsize*2]
+ packssdw m0, m1
+ ret
+
+%macro BIDIR_FN 0
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.ret:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jne .w8_loop
+ RET
+.w16_loop:
+ call .main
+ add dstq, strideq
+.w16:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ dec hd
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
+%define base r6-avg_ssse3_table
+ LEA r6, avg_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ movddup m2, [base+bidir_rnd+t0*8]
+ movddup m3, [base+bidir_mul+t0*8]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m0, [tmp1q+16*0]
+ paddsw m0, [tmp2q+16*0]
+ mova m1, [tmp1q+16*1]
+ paddsw m1, [tmp2q+16*1]
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ pmaxsw m0, m2
+ pmaxsw m1, m2
+ psubsw m0, m2
+ psubsw m1, m2
+ pmulhw m0, m3
+ pmulhw m1, m3
+ ret
+
+cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
+%define base r6-w_avg_ssse3_table
+ LEA r6, w_avg_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r6m ; weight
+ movd m6, r7m ; pixel_max
+ movddup m5, [base+pd_65538]
+ movsxd wq, [r6+wq*4]
+ pshufb m6, [base+pw_256]
+ add wq, r6
+ lea r6d, [t0-16]
+ shl t0d, 16
+ sub t0d, r6d ; 16-weight, weight
+ paddw m5, m6
+ mov r6d, t0d
+ shl t0d, 2
+ test dword r7m, 0x800
+ cmovnz r6d, t0d
+ movifnidn hd, hm
+ movd m4, r6d
+ pslld m5, 7
+ pxor m7, m7
+ pshufd m4, m4, q0000
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m2, [tmp1q+16*0]
+ mova m0, [tmp2q+16*0]
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ mova m2, [tmp1q+16*1]
+ mova m1, [tmp2q+16*1]
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ pmaddwd m3, m4
+ pmaddwd m0, m4
+ paddd m3, m5
+ paddd m0, m5
+ psrad m3, 8
+ psrad m0, 8
+ packssdw m0, m3
+ punpckhwd m3, m1, m2
+ punpcklwd m1, m2
+ pmaddwd m3, m4
+ pmaddwd m1, m4
+ paddd m3, m5
+ paddd m1, m5
+ psrad m3, 8
+ psrad m1, 8
+ packssdw m1, m3
+ pminsw m0, m6
+ pminsw m1, m6
+ pmaxsw m0, m7
+ pmaxsw m1, m7
+ ret
+
+%if ARCH_X86_64
+cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%else
+cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+%define hd dword r5m
+%define m8 [base+pw_64]
+%endif
+%define base r6-mask_ssse3_table
+ LEA r6, mask_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r7m ; pixel_max
+ shr t0d, 11
+ movsxd wq, [r6+wq*4]
+ movddup m6, [base+bidir_rnd+t0*8]
+ movddup m7, [base+bidir_mul+t0*8]
+%if ARCH_X86_64
+ mova m8, [base+pw_64]
+ movifnidn hd, hm
+%endif
+ add wq, r6
+ mov maskq, r6mp
+ BIDIR_FN
+ALIGN function_align
+.main:
+ movq m3, [maskq+8*0]
+ mova m0, [tmp1q+16*0]
+ mova m4, [tmp2q+16*0]
+ pxor m5, m5
+ punpcklbw m3, m5
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ psubw m1, m8, m3
+ punpckhwd m4, m3, m1 ; m, 64-m
+ punpcklwd m3, m1
+ pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m0, m3
+ movq m3, [maskq+8*1]
+ mova m1, [tmp1q+16*1]
+ mova m4, [tmp2q+16*1]
+ add maskq, 8*2
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ psrad m2, 5
+ psrad m0, 5
+ packssdw m0, m2
+ punpcklbw m3, m5
+ punpckhwd m2, m1, m4
+ punpcklwd m1, m4
+ psubw m5, m8, m3
+ punpckhwd m4, m3, m5 ; m, 64-m
+ punpcklwd m3, m5
+ pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m1, m3
+ psrad m2, 5
+ psrad m1, 5
+ packssdw m1, m2
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ psubsw m0, m6
+ psubsw m1, m6
+ pmulhw m0, m7
+ pmulhw m1, m7
+ ret
+
+cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_420_ssse3_table
+ LEA t0, w_mask_420_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd m0, r7m ; sign
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m4, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*4
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ mova [rsp+16*3], m4
+ %define m8 [rsp+gprsize+16*0]
+ %define m9 [rsp+gprsize+16*1]
+ %define m10 [rsp+gprsize+16*2]
+ %define m11 [rsp+gprsize+16*3]
+%endif
+ movd m7, [base+pw_2]
+ psubw m7, m0
+ pshufb m7, [base+pw_256]
+ add wq, t0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w4:
+ movq [dstq+strideq*0], m0
+ phaddw m2, m3
+ movhps [dstq+strideq*1], m0
+ phaddd m2, m2
+ lea dstq, [dstq+strideq*2]
+ paddw m2, m7
+ movq [dstq+strideq*0], m1
+ psrlw m2, 2
+ movhps [dstq+strideq*1], m1
+ packuswb m2, m2
+ movd [maskq], m2
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w8:
+ mova [dstq+strideq*0], m0
+ paddw m2, m3
+ phaddw m2, m2
+ mova [dstq+strideq*1], m1
+ paddw m2, m7
+ psrlw m2, 2
+ packuswb m2, m2
+ movd [maskq], m2
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 8
+.w16:
+ mova [dstq+strideq*1+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*1], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*0]
+ paddw m3, [dstq+strideq*1+16*1]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*1], m1
+ paddw m2, m7
+ psrlw m2, 2
+ packuswb m2, m2
+ movq [maskq], m2
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ mova [dstq+strideq*1+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*1], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*0+16*2], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*0]
+ paddw m3, [dstq+strideq*1+16*1]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*1], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*2]
+ paddw m2, [dstq+strideq*1+16*3]
+ mova [dstq+strideq*1+16*2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*3], m1
+ packuswb m3, m2
+ mova [maskq], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16*2
+.w64:
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*2], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*2], m0
+ mova [dstq+strideq*1+16*4], m3
+ mova [dstq+strideq*0+16*3], m1
+ call .main
+ mova [dstq+strideq*1+16*5], m2
+ mova [dstq+strideq*0+16*4], m0
+ mova [dstq+strideq*1+16*6], m3
+ mova [dstq+strideq*0+16*5], m1
+ call .main
+ mova [dstq+strideq*0+16*6], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*7], m2
+ mova [dstq+strideq*0+16*7], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*1]
+ paddw m3, [dstq+strideq*1+16*2]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*3]
+ paddw m3, [dstq+strideq*1+16*4]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*2]
+ mova [dstq+strideq*1+16*2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*3], m1
+ packuswb m3, m2
+ mova [maskq+16*0], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16*5]
+ paddw m3, [dstq+strideq*1+16*6]
+ mova [dstq+strideq*1+16*4], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*6], m2
+ mova [dstq+strideq*1+16*5], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*6]
+ paddw m2, [dstq+strideq*1+16*7]
+ mova [dstq+strideq*1+16*6], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*7], m1
+ packuswb m3, m2
+ mova [maskq+16*1], m3
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16*4
+.w128:
+ mova [dstq+strideq*1+16* 1], m2
+ mova [dstq+strideq*0+16* 0], m0
+ mova [dstq+strideq*1+16* 2], m3
+ mova [dstq+strideq*0+16* 1], m1
+ call .main
+ mova [dstq+strideq*1+16* 3], m2
+ mova [dstq+strideq*0+16* 2], m0
+ mova [dstq+strideq*1+16* 4], m3
+ mova [dstq+strideq*0+16* 3], m1
+ call .main
+ mova [dstq+strideq*1+16* 5], m2
+ mova [dstq+strideq*0+16* 4], m0
+ mova [dstq+strideq*1+16* 6], m3
+ mova [dstq+strideq*0+16* 5], m1
+ call .main
+ mova [dstq+strideq*1+16* 7], m2
+ mova [dstq+strideq*0+16* 6], m0
+ mova [dstq+strideq*1+16* 8], m3
+ mova [dstq+strideq*0+16* 7], m1
+ call .main
+ mova [dstq+strideq*1+16* 9], m2
+ mova [dstq+strideq*0+16* 8], m0
+ mova [dstq+strideq*1+16*10], m3
+ mova [dstq+strideq*0+16* 9], m1
+ call .main
+ mova [dstq+strideq*1+16*11], m2
+ mova [dstq+strideq*0+16*10], m0
+ mova [dstq+strideq*1+16*12], m3
+ mova [dstq+strideq*0+16*11], m1
+ call .main
+ mova [dstq+strideq*1+16*13], m2
+ mova [dstq+strideq*0+16*12], m0
+ mova [dstq+strideq*1+16*14], m3
+ mova [dstq+strideq*0+16*13], m1
+ call .main
+ mova [dstq+strideq*0+16*14], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*15], m2
+ mova [dstq+strideq*0+16*15], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 1]
+ paddw m3, [dstq+strideq*1+16* 2]
+ mova [dstq+strideq*1+16* 0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16* 2], m2
+ mova [dstq+strideq*1+16* 1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 3]
+ paddw m3, [dstq+strideq*1+16* 4]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16* 2]
+ mova [dstq+strideq*1+16* 2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16* 3], m1
+ packuswb m3, m2
+ mova [maskq+16*0], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16* 5]
+ paddw m3, [dstq+strideq*1+16* 6]
+ mova [dstq+strideq*1+16* 4], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16* 6], m2
+ mova [dstq+strideq*1+16* 5], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 7]
+ paddw m3, [dstq+strideq*1+16* 8]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16* 6]
+ mova [dstq+strideq*1+16* 6], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16* 7], m1
+ packuswb m3, m2
+ mova [maskq+16*1], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16* 9]
+ paddw m3, [dstq+strideq*1+16*10]
+ mova [dstq+strideq*1+16* 8], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*10], m2
+ mova [dstq+strideq*1+16* 9], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*11]
+ paddw m3, [dstq+strideq*1+16*12]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*10]
+ mova [dstq+strideq*1+16*10], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*11], m1
+ packuswb m3, m2
+ mova [maskq+16*2], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16*13]
+ paddw m3, [dstq+strideq*1+16*14]
+ mova [dstq+strideq*1+16*12], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*14], m2
+ mova [dstq+strideq*1+16*13], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*14]
+ paddw m2, [dstq+strideq*1+16*15]
+ mova [dstq+strideq*1+16*14], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*15], m1
+ packuswb m3, m2
+ mova [maskq+16*3], m3
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+%macro W_MASK 2 ; dst/tmp_offset, mask
+ mova m%1, [tmp1q+16*%1]
+ mova m%2, [tmp2q+16*%1]
+ punpcklwd m4, m%2, m%1
+ punpckhwd m5, m%2, m%1
+ psubsw m%1, m%2
+ pabsw m%1, m%1
+ psubusw m6, m8, m%1
+ psrlw m6, 10 ; 64-m
+ psubw m%2, m9, m6 ; m
+ punpcklwd m%1, m6, m%2
+ punpckhwd m6, m%2
+ pmaddwd m%1, m4
+ pmaddwd m6, m5
+ psrad m%1, 5
+ psrad m6, 5
+ packssdw m%1, m6
+ pmaxsw m%1, m10
+ psubsw m%1, m10
+ pmulhw m%1, m11
+%endmacro
+ W_MASK 0, 2
+ W_MASK 1, 3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ ret
+
+cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_422_ssse3_table
+ LEA t0, w_mask_422_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd m7, r7m ; sign
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615]
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615]
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m4, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*4
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ mova [rsp+16*3], m4
+%endif
+ pxor m0, m0
+ add wq, t0
+ pshufb m7, m0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2
+ W_MASK 1, 3
+ phaddw m2, m3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ packuswb m2, m2
+ pxor m3, m3
+ psubb m2, m7
+ pavgb m2, m3
+ movq [maskq], m2
+ add maskq, 8
+ ret
+
+cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_444_ssse3_table
+ LEA t0, w_mask_444_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615]
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615]
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m7, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*3
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ %define m11 m7
+%endif
+ add wq, t0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2
+ W_MASK 1, 3
+ packuswb m2, m3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ mova [maskq], m2
+ add maskq, 16
+ ret
+
+; (a * (64 - m) + b * m + 32) >> 6
+; = (((b - a) * m + 32) >> 6) + a
+; = (((b - a) * (m << 9) + 16384) >> 15) + a
+; except m << 9 overflows int16_t when m == 64 (which is possible),
+; but if we negate m it works out (-64 << 9 == -32768).
+; = (((a - b) * (m * -512) + 16384) >> 15) + a
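+;
+; Illustrative check of the last form with arbitrarily chosen values
+; (not taken from the code): a = 0, b = 64, m = 48:
+; exact: (0*(64-48) + 64*48 + 32) >> 6 = 48
+; rewritten: (((0-64) * (48*-512) + 16384) >> 15) + 0 = 48
+; pmulhrsw computes (x*y + 16384) >> 15, hence the mask is premultiplied
+; by -512 (pw_m512) before the single pmulhrsw applied to (dst - tmp).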
+cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
+%define base r6-blend_ssse3_table
+ LEA r6, blend_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ mova m7, [base+pw_m512]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ pxor m6, m6
+ jmp wq
+.w4:
+ mova m5, [maskq]
+ movq m0, [dstq+strideq*0]
+ movhps m0, [dstq+strideq*1]
+ movq m1, [dstq+strideq*2]
+ movhps m1, [dstq+stride3q ]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ mova m5, [maskq]
+ mova m0, [dstq+strideq*0]
+ mova m1, [dstq+strideq*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m5, [maskq]
+ mova m0, [dstq+16*0]
+ mova m1, [dstq+16*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16
+ RET
+.w32:
+ mova m5, [maskq+16*0]
+ mova m0, [dstq+16*0]
+ mova m1, [dstq+16*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova m5, [maskq+16*1]
+ mova m0, [dstq+16*2]
+ mova m1, [dstq+16*3]
+ psubw m2, m0, [tmpq+16*2]
+ psubw m3, m1, [tmpq+16*3]
+ add maskq, 32
+ add tmpq, 64
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ add dstq, strideq
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
+%define base r5-blend_v_ssse3_table
+ LEA r5, blend_v_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ movd m4, [base+obmc_masks+2*2]
+.w2_loop:
+ movd m0, [dstq+strideq*0]
+ movd m2, [tmpq+4*0]
+ movd m1, [dstq+strideq*1]
+ movd m3, [tmpq+4*1]
+ add tmpq, 4*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ movddup m2, [base+obmc_masks+4*2]
+.w4_loop:
+ movq m0, [dstq+strideq*0]
+ movhps m0, [dstq+strideq*1]
+ mova m1, [tmpq]
+ add tmpq, 8*2
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova m4, [base+obmc_masks+8*2]
+.w8_loop:
+ mova m0, [dstq+strideq*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+strideq*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ mova m4, [base+obmc_masks+16*2]
+ movq m5, [base+obmc_masks+16*3]
+.w16_loop:
+ mova m0, [dstq+16*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+16*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ movaps [rsp+8], m6
+%endif
+ mova m4, [base+obmc_masks+16*4]
+ mova m5, [base+obmc_masks+16*5]
+ mova m6, [base+obmc_masks+16*6]
+.w32_loop:
+ mova m0, [dstq+16*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+16*1]
+ mova m3, [tmpq+16*1]
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ mova m2, [dstq+16*2]
+ paddw m1, m3
+ mova m3, [tmpq+16*2]
+ add tmpq, 16*4
+ psubw m3, m2
+ pmulhrsw m3, m6
+ paddw m2, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ movaps m6, [rsp+8]
+%endif
+ RET
+
+%macro BLEND_H_ROW 2-3 0 ; dst_off, tmp_off, inc_tmp
+ mova m0, [dstq+16*(%1+0)]
+ mova m2, [tmpq+16*(%2+0)]
+ mova m1, [dstq+16*(%1+1)]
+ mova m3, [tmpq+16*(%2+1)]
+%if %3
+ add tmpq, 16*%3
+%endif
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*(%1+0)], m0
+ mova [dstq+16*(%1+1)], m1
+%endmacro
+
+cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base r6-blend_h_ssse3_table
+ LEA r6, blend_h_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m4, [base+blend_shuf]
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ add wq, r6
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ movd m2, [dstq+dsq*1]
+ movd m3, [maskq+hq*2]
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ punpckldq m0, m2
+ punpcklwd m3, m3
+ psubw m1, m0
+ pmulhrsw m1, m3
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova m3, [base+blend_shuf]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ movd m2, [maskq+hq*2]
+ mova m1, [tmpq]
+ add tmpq, 8*2
+ psubw m1, m0
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ movddup m5, [base+blend_shuf+8]
+%if WIN64
+ movaps [rsp+ 8], m6
+ movaps [rsp+24], m7
+%endif
+.w8_loop:
+ movd m7, [maskq+hq*2]
+ mova m0, [dstq+dsq*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+dsq*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ pshufb m6, m7, m4
+ psubw m2, m0
+ pshufb m7, m5
+ psubw m3, m1
+ pmulhrsw m2, m6
+ pmulhrsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+%if WIN64
+ movaps m6, [rsp+ 8]
+ movaps m7, [rsp+24]
+%endif
+ RET
+.w16:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0, 2
+ add dstq, dsq
+ inc hq
+ jl .w16
+ RET
+.w32:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 4
+ add dstq, dsq
+ inc hq
+ jl .w32
+ RET
+.w64:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2
+ BLEND_H_ROW 4, 4
+ BLEND_H_ROW 6, 6, 8
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2
+ BLEND_H_ROW 4, 4
+ BLEND_H_ROW 6, 6, 16
+ BLEND_H_ROW 8, -8
+ BLEND_H_ROW 10, -6
+ BLEND_H_ROW 12, -4
+ BLEND_H_ROW 14, -2
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh: total filled size
+; iw, ih: copied block -> fill bottom, right
+; x, y: offset in bw/bh -> fill top, left
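+;
+; overall, the clipping done below works out to (restating the inline comments):
+; ref += iclip(y, 0, ih-1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw-1)
+; top_ext = iclip(-y, 0, bh-1), bottom_ext = iclip(y+bh-ih, 0, bh-1)
+; left_ext = iclip(-x, 0, bw-1), right_ext = iclip(x+bw-iw, 0, bw-1)
+; center_h = bh - top_ext - bottom_ext, center_w = bw - left_ext - right_ext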
+cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
+ y, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+%if ARCH_X86_64
+ %define reg_zero r12q
+ %define reg_tmp r10
+ %define reg_src srcq
+ %define reg_bottomext bottomextq
+ %define reg_rightext rightextq
+ %define reg_blkm r9m
+%else
+ %define reg_zero r6
+ %define reg_tmp r0
+ %define reg_src r1
+ %define reg_bottomext r0
+ %define reg_rightext r1
+ %define reg_blkm r2m
+%endif
+ ;
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor reg_zero, reg_zero
+ lea reg_tmp, [ihq-1]
+ cmp yq, ihq
+ cmovs reg_tmp, yq
+ test yq, yq
+ cmovs reg_tmp, reg_zero
+%if ARCH_X86_64
+ imul reg_tmp, sstrideq
+ add srcq, reg_tmp
+%else
+ imul reg_tmp, sstridem
+ mov reg_src, srcm
+ add reg_src, reg_tmp
+%endif
+ ;
+ ; ref += iclip(x, 0, iw - 1)
+ lea reg_tmp, [iwq-1]
+ cmp xq, iwq
+ cmovs reg_tmp, xq
+ test xq, xq
+ cmovs reg_tmp, reg_zero
+ lea reg_src, [reg_src+reg_tmp*2]
+%if ARCH_X86_32
+ mov srcm, reg_src
+%endif
+ ;
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+ mov r1, r1m ; restore bh
+%endif
+ lea reg_bottomext, [yq+bhq]
+ sub reg_bottomext, ihq
+ lea r3, [bhq-1]
+ cmovs reg_bottomext, reg_zero
+ ;
+
+ DEFINE_ARGS bw, bh, iw, ih, x, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, reg_zero
+ cmp reg_bottomext, bhq
+ cmovns reg_bottomext, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+ %if ARCH_X86_32
+ mov r4m, reg_bottomext
+ ;
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ mov r0, r0m ; restore bw
+ %endif
+ lea reg_rightext, [xq+bwq]
+ sub reg_rightext, iwq
+ lea r2, [bwq-1]
+ cmovs reg_rightext, reg_zero
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, reg_zero
+ cmp reg_rightext, bwq
+ cmovns reg_rightext, r2
+ %if ARCH_X86_32
+ mov r3m, r1
+ %endif
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+%else
+ mov r1, centerhm ; restore r1
+ sub centerhq, topextq
+ sub centerhq, r4m
+ mov r1m, centerhq
+%endif
+ ;
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+%if ARCH_X86_64
+ imul r2, dstrideq
+%else
+ mov r6, r6m ; restore dstq
+ imul r2, dstridem
+%endif
+ add dstq, r2
+ mov reg_blkm, dstq ; save pointer for ext
+ ;
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+%if ARCH_X86_64
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+%else
+ sub centerwq, r3m
+ sub centerwq, leftextq
+%endif
+
+; vloop Macro
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+ %if ARCH_X86_64
+ %define reg_tmp r12
+ %else
+ %define reg_tmp r0
+ %endif
+.v_loop_%3:
+ %if ARCH_X86_32
+ mov r0, r0m
+ mov r1, r1m
+ %endif
+%if %1
+ ; left extension
+ %if ARCH_X86_64
+ movd m0, [srcq]
+ %else
+ mov r3, srcm
+ movd m0, [r3]
+ %endif
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ xor r3, r3
+.left_loop_%3:
+ mova [dstq+r3*2], m0
+ add r3, mmsize/2
+ cmp r3, leftextq
+ jl .left_loop_%3
+ ; body
+ lea reg_tmp, [dstq+leftextq*2]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ %if ARCH_X86_64
+ movu m0, [srcq+r3*2]
+ %else
+ mov r1, srcm
+ movu m0, [r1+r3*2]
+ %endif
+%if %1
+ movu [reg_tmp+r3*2], m0
+%else
+ movu [dstq+r3*2], m0
+%endif
+ add r3, mmsize/2
+ cmp r3, centerwq
+ jl .body_loop_%3
+%if %2
+ ; right extension
+%if %1
+ lea reg_tmp, [reg_tmp+centerwq*2]
+%else
+ lea reg_tmp, [dstq+centerwq*2]
+%endif
+ %if ARCH_X86_64
+ movd m0, [srcq+centerwq*2-2]
+ %else
+ mov r3, srcm
+ movd m0, [r3+centerwq*2-2]
+ %endif
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ xor r3, r3
+.right_loop_%3:
+ movu [reg_tmp+r3*2], m0
+ add r3, mmsize/2
+ %if ARCH_X86_64
+ cmp r3, rightextq
+ %else
+ cmp r3, r3m
+ %endif
+ jl .right_loop_%3
+%endif
+ %if ARCH_X86_64
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+ %else
+ add dstq, dstridem
+ mov r0, sstridem
+ add srcm, r0
+ sub dword centerhm, 1
+ jg .v_loop_%3
+ mov r0, r0m ; restore r0
+ %endif
+%endmacro ; vloop MACRO
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ jnz .need_right_ext
+ %else
+ cmp leftextq, r3m ; leftextq == 0
+ jne .need_right_ext
+ %endif
+ v_loop 0, 0, 0
+ jmp .body_done
+
+ ; left/right extensions
+.need_left_ext:
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ %else
+ mov r3, r3m
+ test r3, r3
+ %endif
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+; r0: bw
+; r1: x loop
+; r4: y loop
+; r5: topextq
+; r6: dstq
+; r7: dstrideq
+; r8: srcq
+%if ARCH_X86_64
+ %define reg_dstride dstrideq
+%else
+ %define reg_dstride r2
+%endif
+ ;
+ ; bottom edge extension
+ %if ARCH_X86_64
+ test bottomextq, bottomextq
+ jz .top
+ %else
+ xor r1, r1
+ cmp r1, r4m
+ je .top
+ %endif
+ ;
+ %if ARCH_X86_64
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+ %else
+ mov r3, dstq
+ mov reg_dstride, dstridem
+ sub r3, reg_dstride
+ mov srcm, r3
+ %endif
+ ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, bottomextq
+ %else
+ mov r3, srcm
+ mova m0, [r3+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, r4m
+ %endif
+ ;
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .bottom_y_loop
+ add r1, mmsize/2
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+%if ARCH_X86_64
+ mov srcq, reg_blkm
+%else
+ mov r3, reg_blkm
+ mov reg_dstride, dstridem
+%endif
+ mov dstq, dstm
+ xor r1, r1
+ ;
+.top_x_loop:
+%if ARCH_X86_64
+ mova m0, [srcq+r1*2]
+%else
+ mov r3, reg_blkm
+ mova m0, [r3+r1*2]
+%endif
+ lea r3, [dstq+r1*2]
+ mov r4, topextq
+ ;
+.top_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .top_y_loop
+ add r1, mmsize/2
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%elif STACK_ALIGNMENT >= 16
+cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%else
+cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%endif
+ movifnidn dstq, dstmp
+ movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+ movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+ movifnidn hd, hm
+%endif
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ movd m4, pxmaxm
+ movd m7, dxm
+ movd m6, mx0m
+ movd m5, src_wm
+ punpcklwd m4, m4
+ pshufd m4, m4, q0000
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ pshufd m5, m5, q0000
+ mova [rsp+16*3*ARCH_X86_32], m4
+%if ARCH_X86_64
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+ %define base r7-$$
+%else
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+ %define hd dword r5m
+ %if STACK_ALIGNMENT >= 16
+ LEA r6, $$
+ %define base r6-$$
+ %else
+ LEA r4, $$
+ %define base r4-$$
+ %endif
+%endif
+%if ARCH_X86_64
+ mova m12, [base+pd_64]
+ mova m11, [base+pd_63]
+%else
+ %define m12 [base+pd_64]
+ %define m11 [base+pd_63]
+%endif
+ pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
+ pslld m7, 2 ; dx*4
+ pslld m5, 14
+ paddd m6, m4 ; mx+[0..3]*dx
+ SCRATCH 7, 15, 0
+ SCRATCH 6, 14, 1
+ SCRATCH 5, 13, 2
+ pxor m1, m1
+.loop_y:
+ xor xd, xd
+ mova m0, m14 ; per-line working version of mx
+.loop_x:
+ pcmpgtd m1, m0
+ pandn m1, m0
+ psrad m2, m0, 8 ; filter offset (unmasked)
+ pcmpgtd m3, m13, m1
+ pand m1, m3
+ pandn m3, m13
+ por m1, m3
+ psubd m3, m0, m1 ; pshufb offset
+ psrad m1, 14 ; clipped src_x offset
+ psrad m3, 14 ; pshufb edge_emu offset
+ pand m2, m11 ; filter offset (masked)
+ ; load source pixels
+%if ARCH_X86_64
+ movd r8d, m1
+ pshuflw m1, m1, q3232
+ movd r9d, m1
+ punpckhqdq m1, m1
+ movd r10d, m1
+ psrlq m1, 32
+ movd r11d, m1
+ movu m4, [srcq+r8*2]
+ movu m5, [srcq+r9*2]
+ movu m6, [srcq+r10*2]
+ movu m7, [srcq+r11*2]
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ packssdw m3, m3
+ movq r11, m3
+ test r11, r11
+ jz .filter
+ movsx r8, r11w
+ sar r11, 16
+ movsx r9, r11w
+ sar r11, 16
+ movsx r10, r11w
+ sar r11, 16
+ movu m1, [base+resize_shuf+8+r8*2]
+ movu m3, [base+resize_shuf+8+r9*2]
+ movu m8, [base+resize_shuf+8+r10*2]
+ movu m9, [base+resize_shuf+8+r11*2]
+ pshufb m4, m1
+ pshufb m5, m3
+ pshufb m6, m8
+ pshufb m7, m9
+.filter:
+ movd r8d, m2
+ pshuflw m2, m2, q3232
+ movd r9d, m2
+ punpckhqdq m2, m2
+ movd r10d, m2
+ psrlq m2, 32
+ movd r11d, m2
+ movq m8, [base+resize_filter+r8*8]
+ movq m2, [base+resize_filter+r9*8]
+ pxor m9, m9
+ punpcklbw m1, m9, m8
+ punpcklbw m3, m9, m2
+ psraw m1, 8
+ psraw m3, 8
+ movq m10, [base+resize_filter+r10*8]
+ movq m2, [base+resize_filter+r11*8]
+ punpcklbw m8, m9, m10
+ punpcklbw m9, m2
+ psraw m8, 8
+ psraw m9, 8
+ pmaddwd m4, m1
+ pmaddwd m5, m3
+ pmaddwd m6, m8
+ pmaddwd m7, m9
+ phaddd m4, m5
+%else
+ movd r3, m1
+ pshuflw m1, m1, q3232
+ movd r1, m1
+ punpckhqdq m1, m1
+ movu m4, [srcq+r3*2]
+ movu m5, [srcq+r1*2]
+ movd r3, m1
+ psrlq m1, 32
+ movd r1, m1
+ movu m6, [srcq+r3*2]
+ movu m7, [srcq+r1*2]
+ ; skip the shuffle-based edge emulation when all source positions are in range
+ pxor m1, m1
+ pcmpeqb m1, m3
+ pmovmskb r3d, m1
+ cmp r3d, 0xffff
+ je .filter
+ movd r3, m3
+ movu m1, [base+resize_shuf+8+r3*2]
+ pshuflw m3, m3, q3232
+ movd r1, m3
+ pshufb m4, m1
+ movu m1, [base+resize_shuf+8+r1*2]
+ punpckhqdq m3, m3
+ movd r3, m3
+ pshufb m5, m1
+ movu m1, [base+resize_shuf+8+r3*2]
+ psrlq m3, 32
+ movd r1, m3
+ pshufb m6, m1
+ movu m1, [base+resize_shuf+8+r1*2]
+ pshufb m7, m1
+.filter:
+ mova [esp+4*16], m6
+ mova [esp+5*16], m7
+ movd r3, m2
+ pshuflw m2, m2, q3232
+ movd r1, m2
+ movq m6, [base+resize_filter+r3*8]
+ movq m7, [base+resize_filter+r1*8]
+ pxor m3, m3
+ punpcklbw m1, m3, m6
+ punpcklbw m3, m7
+ psraw m1, 8
+ psraw m3, 8
+ pmaddwd m4, m1
+ pmaddwd m5, m3
+ punpckhqdq m2, m2
+ movd r3, m2
+ psrlq m2, 32
+ movd r1, m2
+ phaddd m4, m5
+ movq m2, [base+resize_filter+r3*8]
+ movq m5, [base+resize_filter+r1*8]
+ mova m6, [esp+4*16]
+ mova m7, [esp+5*16]
+ pxor m3, m3
+ punpcklbw m1, m3, m2
+ punpcklbw m3, m5
+ psraw m1, 8
+ psraw m3, 8
+ pmaddwd m6, m1
+ pmaddwd m7, m3
+%endif
+ phaddd m6, m7
+ phaddd m4, m6
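+ ; result = (64 - sum) >> 7, clamped to [0, pixel_max] saved on the stack above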
+ pxor m1, m1
+ psubd m2, m12, m4
+ psrad m2, 7
+ packssdw m2, m2
+ pmaxsw m2, m1
+ pminsw m2, [rsp+16*3*ARCH_X86_32]
+ movq [dstq+xq*2], m2
+ paddd m0, m15
+ add xd, 4
+%if STACK_ALIGNMENT >= 16
+ cmp xd, dst_wd
+%else
+ cmp xd, dst_wm
+%endif
+ jl .loop_x
+ add dstq, dst_stridemp
+ add srcq, src_stridemp
+ dec hd
+ jg .loop_y
+ RET
diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm
new file mode 100644
index 0000000000..3b208033bd
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_avx2.asm
@@ -0,0 +1,5669 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018-2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db 0, 0, 0, 0
+ ; 2
+ db 45, 19, 64, 0
+ ; 4
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+ db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+ db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+ db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8
+bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+
+wm_420_sign: dd 0x01020102, 0x01010101
+wm_422_sign: dd 0x80808080, 0x7f7f7f7f
+
+pb_64: times 4 db 64
+pw_m256: times 2 dw -256
+pw_15: times 2 dw 15
+pw_32: times 2 dw 32
+pw_34: times 2 dw 34
+pw_258: times 2 dw 258
+pw_512: times 2 dw 512
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
+pd_0x3ff: dd 0x3ff
+pd_0x4000: dd 0x4000
+pq_0x40000000: dq 0x40000000
+
+cextern mc_subpel_filters
+cextern mc_warp_filter2
+cextern resize_filter
+
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
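+; The jump tables below hold per-width entry-point offsets relative to a base
+; label (16-bit words, or 32-bit dwords for the bidir tables); width dispatch
+; is then a single load + add + indirect jump indexed by log2(w).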
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32
+
+SECTION .text
+
+INIT_XMM avx2
+cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea r7, [put_avx2]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ movzx r6d, word [srcq+ssq*0]
+ movzx r7d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6w
+ mov [dstq+dsq*1], r7w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+INIT_YMM avx2
+.put_w32:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*0+32*1], m1
+ mova [dstq+dsq*1+32*0], m2
+ mova [dstq+dsq*1+32*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ add srcq, ssq
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
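+ ; mx*255 + 16 = (mx << 8) | (16 - mx) packs both pmaddubsw coefficients into
+ ; one word, and pmulhrsw with pw_2048 performs the (x + 8) >> 4 rounding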
+ imul mxyd, 255
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16
+ movd xm5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ vpbroadcastd m3, [pw_2048]
+ add wq, r7
+ jmp wq
+.h_w2:
+ movd xm0, [srcq+ssq*0]
+ pinsrd xm0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm3
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ mova xm4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm3
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pshufb xm1, xm4
+ pmaddubsw xm0, xm5
+ pmaddubsw xm1, xm5
+ pmulhrsw xm0, xm3
+ pmulhrsw xm1, xm3
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movu m1, [srcq+8*4]
+ movu m2, [srcq+8*5]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov r6, -32*3
+.h_w128_loop:
+ movu m0, [srcq+r6+32*3+8*0]
+ movu m1, [srcq+r6+32*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+32*3], m0
+ add r6, 32
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
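+ ; vertical-only: same (16-my)/my coefficient packing as .h, applied to
+ ; vertically adjacent rows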
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 255
+ vpbroadcastd m5, [pw_2048]
+ add mxyd, 16
+ add wq, r7
+ movd xm4, mxyd
+ vpbroadcastw m4, xm4
+ jmp wq
+.v_w2:
+ movd xm0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
+ pshuflw xm1, xm1, q2301 ; 1 0
+ punpcklbw xm1, xm0
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 1
+ pextrw [dstq+dsq*1], xm1, 0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm0, [srcq+ssq*0]
+.v_w4_loop:
+ vpbroadcastd xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm1, xm2, xm0, 0x01 ; 0 1
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm2, xm0, 0x02 ; 1 2
+ punpcklbw xm1, xm2
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+ssq*0]
+.v_w8_loop:
+ movq xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw xm1, xm0, xm2
+ movq xm0, [srcq+ssq*0]
+ punpcklbw xm2, xm0
+ pmaddubsw xm1, xm4
+ pmaddubsw xm2, xm4
+ pmulhrsw xm1, xm5
+ pmulhrsw xm2, xm5
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu xm0, [srcq+ssq*0]
+.v_w16_loop:
+ vbroadcasti128 m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m2, m3, m0, 0x0f ; 0 1
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m3, m0, 0xf0 ; 1 2
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+%macro PUT_BILIN_V_W32 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ movu m0, [srcq+ssq*0]
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+ PUT_BILIN_V_W32
+ RET
+.v_w64:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+.v_w64_loop:
+ add srcq, ssq
+ movu m3, [srcq+32*0]
+ punpcklbw m2, m0, m3
+ punpckhbw m0, m3
+ pmaddubsw m2, m4
+ pmaddubsw m0, m4
+ pmulhrsw m2, m5
+ pmulhrsw m0, m5
+ packuswb m2, m0
+ mova m0, m3
+ movu m3, [srcq+32*1]
+ mova [dstq+32*0], m2
+ punpcklbw m2, m1, m3
+ punpckhbw m1, m3
+ pmaddubsw m2, m4
+ pmaddubsw m1, m4
+ pmulhrsw m2, m5
+ pmulhrsw m1, m5
+ packuswb m2, m1
+ mova m1, m3
+ mova [dstq+32*1], m2
+ add dstq, dsq
+ dec hd
+ jg .v_w64_loop
+ RET
+.v_w128:
+ lea r6d, [hq+(3<<8)]
+ mov r4, srcq
+ mov r7, dstq
+.v_w128_loop:
+ PUT_BILIN_V_W32
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
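+ ; my is scaled by 2^11 rather than 2^12 (15 << 12 would overflow a signed
+ ; word); doubling the horizontal coefficients (paddb m5, m5) keeps the
+ ; pmulhw by my << 11 equal to (my * diff) >> 4, and pavgw with pw_15 adds
+ ; the +8 bias while halving the doubled value back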
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ vpbroadcastd m7, [pw_15]
+ movd xm6, mxyd
+ add wq, r7
+ paddb m5, m5
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w2:
+ vpbroadcastd xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w2_loop:
+ movd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pinsrd xm1, [srcq+ssq*0], 1
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 _ 2 _
+ shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _
+ mova xm0, xm1
+ psubw xm1, xm2
+ pmulhw xm1, xm6
+ pavgw xm2, xm7
+ paddw xm1, xm2
+ psrlw xm1, 4
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 0
+ pextrw [dstq+dsq*1], xm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova xm4, [bilin_h_shuf4]
+ movddup xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm1, [srcq+ssq*0]
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 2
+ shufps xm2, xm0, xm1, q1032 ; 0 1
+ mova xm0, xm1
+ psubw xm1, xm2
+ pmulhw xm1, xm6
+ pavgw xm2, xm7
+ paddw xm1, xm2
+ psrlw xm1, 4
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m1, [srcq+ssq*0], 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhw m1, m6
+ pavgw m2, m7
+ paddw m1, m2
+ psrlw m1, 4
+ vextracti128 xm2, m1, 1
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu m0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm2, [srcq+ssq*1+8*0]
+ vinserti128 m2, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ movu xm3, [srcq+ssq*0+8*0]
+ vinserti128 m3, [srcq+ssq*0+8*1], 1
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ pmulhw m1, m6
+ pavgw m0, m7
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ pmulhw m3, m6
+ pavgw m2, m7
+ paddw m3, m2
+ psrlw m1, 4
+ psrlw m3, 4
+ packuswb m1, m3
+ vpermq m1, m1, q3120
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w128:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w32_start
+.hv_w64:
+ lea r6d, [hq+(1<<16)]
+.hv_w32_start:
+ mov r4, srcq
+ mov r7, dstq
+.hv_w32:
+%if WIN64
+ movaps r4m, xmm8
+%endif
+.hv_w32_loop0:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m8, m2, m0
+ pmulhw m8, m6
+ pavgw m0, m7
+ paddw m8, m0
+ mova m0, m2
+ psubw m2, m3, m1
+ pmulhw m2, m6
+ pavgw m1, m7
+ paddw m2, m1
+ mova m1, m3
+ psrlw m8, 4
+ psrlw m2, 4
+ packuswb m8, m2
+ mova [dstq], m8
+ add dstq, dsq
+ dec hd
+ jg .hv_w32_loop
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<16
+ jg .hv_w32_loop0
+%if WIN64
+ movaps xmm8, r4m
+%endif
+ RET
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep%+SUFFIX]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
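+ ; the copy paths store the input as 16-bit intermediates scaled by 16 (px << 4)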
+.prep_w4:
+ movd xm0, [srcq+strideq*0]
+ pinsrd xm0, [srcq+strideq*1], 1
+ pinsrd xm0, [srcq+strideq*2], 2
+ pinsrd xm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ pmovzxbw m1, xm1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0+16*0]
+ pmovzxbw m1, [srcq+strideq*0+16*1]
+ pmovzxbw m2, [srcq+strideq*1+16*0]
+ pmovzxbw m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmovzxbw m0, [srcq+16*4]
+ pmovzxbw m1, [srcq+16*5]
+ pmovzxbw m2, [srcq+16*6]
+ pmovzxbw m3, [srcq+16*7]
+ add tmpq, 32*8
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
+ imul mxyd, 255
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, xm1, 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+.h_w8_loop:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+.h_w16_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ movu xm2, [srcq+strideq*2+8*0]
+ vinserti128 m2, [srcq+strideq*2+8*1], 1
+ movu xm3, [srcq+stride3q +8*0]
+ vinserti128 m3, [srcq+stride3q +8*1], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+.h_w32_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*0+8*2]
+ vinserti128 m1, [srcq+strideq*0+8*3], 1
+ movu xm2, [srcq+strideq*1+8*0]
+ vinserti128 m2, [srcq+strideq*1+8*1], 1
+ movu xm3, [srcq+strideq*1+8*2]
+ vinserti128 m3, [srcq+strideq*1+8*3], 1
+ lea srcq, [srcq+strideq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ movu xm0, [srcq+8* 8]
+ vinserti128 m0, [srcq+8* 9], 1
+ movu xm1, [srcq+8*10]
+ vinserti128 m1, [srcq+8*11], 1
+ movu xm2, [srcq+8*12]
+ vinserti128 m2, [srcq+8*13], 1
+ movu xm3, [srcq+8*14]
+ vinserti128 m3, [srcq+8*15], 1
+ add tmpq, 32*8
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 255
+ add mxyd, 16
+ add wq, r6
+ lea stride3q, [strideq*3]
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ jmp wq
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ vpbroadcastd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x05 ; 0 2 2 2
+ vpbroadcastd m0, [srcq+strideq*0]
+ vpblendd m3, m2, 0x0f ; 1 1 3 3
+ vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
+ vpblendd m1, m3, 0xaa ; 0 1 2 3
+ vpblendd m2, m3, 0x55 ; 1 2 3 4
+ punpcklbw m1, m2
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+strideq*1]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m2, m3, 0xcc ; 1 3 1 3
+ vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2
+ vpblendd m2, m1, 0x0f ; 0 2 1 3
+ vpblendd m3, m0, 0xc0 ; 1 3 2 4
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti128 m0, [srcq+strideq*0]
+.v_w16_loop:
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ shufpd m4, m0, m2, 0x0c ; 0 2
+ vbroadcasti128 m0, [srcq+strideq*0]
+ shufpd m1, m3, 0x0c ; 1 3
+ shufpd m2, m0, 0x0c ; 2 4
+ punpcklbw m3, m4, m1
+ punpcklbw m5, m1, m2
+ punpckhbw m4, m1
+ punpckhbw m1, m2
+ pmaddubsw m3, m6
+ pmaddubsw m5, m6
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m5
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w32_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ vpermq m2, [srcq+strideq*2], q3120
+ vpermq m3, [srcq+stride3q ], q3120
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m0, m1
+ punpckhbw m5, m0, m1
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ punpcklbw m5, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m5, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m1
+ add tmpq, 32*8
+ punpcklbw m1, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m5
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m1
+ mova [tmpq-32*1], m3
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+.v_w64_loop:
+ vpermq m2, [srcq+strideq*1+32*0], q3120
+ vpermq m3, [srcq+strideq*1+32*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m0
+ punpcklbw m4, m1, m3
+ punpckhbw m5, m1, m3
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m5
+ add tmpq, 32*8
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
+ punpcklbw m5, m3, m1
+ punpckhbw m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m5, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m4
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m5
+ mova [tmpq-32*1], m3
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ lea r6d, [hq+(3<<8)]
+ mov r3, srcq
+ mov r5, tmpq
+.v_w128_loop0:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w128_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m2, m0, m1
+ punpckhbw m3, m0, m1
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ punpcklbw m4, m1, m0
+ punpckhbw m1, m0
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m2
+ mova [tmpq+32*1], m3
+ mova [tmpq+32*8], m4
+ mova [tmpq+32*9], m1
+ add tmpq, 32*16
+ sub hd, 2
+ jg .v_w128_loop
+ add r3, 32
+ add r5, 64
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .v_w128_loop0
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+ vpbroadcastq m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq xm1, [srcq+strideq*1]
+ movhps xm1, [srcq+strideq*2]
+ movq xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movhps xm2, [srcq+strideq*0]
+ vinserti128 m1, xm2, 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ vpblendd m2, m1, m0, 0xc0
+ vpermq m2, m2, q2103 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, [srcq+strideq*0], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m3, m0, m1, 0x21 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vperm2i128 m2, m1, m0, 0x21 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, strideq
+ movu xm2, [srcq+8*0]
+ vinserti128 m2, [srcq+8*1], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ mova m0, m2
+ movu xm2, [srcq+8*2]
+ vinserti128 m2, [srcq+8*3], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ mova [tmpq+32*0], m3
+ psubw m3, m2, m1
+ pmulhrsw m3, m6
+ paddw m3, m1
+ mova m1, m2
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ dec hd
+ jg .hv_w32_loop
+ RET
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r6d, 256
+ jmp .hv_w64_start
+.hv_w64:
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 128
+.hv_w64_start:
+%if WIN64
+ PUSH r7
+%endif
+ mov r5, srcq
+ mov r7, tmpq
+.hv_w64_loop0:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w64_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+r6*0], m3
+ mova [tmpq+r6*1], m2
+ lea tmpq, [tmpq+r6*2]
+ sub hd, 2
+ jg .hv_w64_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r3b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r3d, 1<<8
+ jg .hv_w64_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
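+; Each constant packs the first-entry index of the 8-tap filter set (bits 16+)
+; and of the 4-tap fallback set (low bits) within subpel_filters[5][15][8];
+; sharp has no 4-tap variant, so it falls back to the regular 4-tap set.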
+
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
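+; Each %1_%2 entry point just loads the packed filter-set indices into t0d/t1d
+; and tail-calls the shared %1 body; the regular/regular variant, defined last,
+; simply falls through into it.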
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
+ WIN64_SPILL_XMM 11
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xm4, [subpel_h_shuf4]
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w2_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm3
+ phaddw xm0, xm0
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm6
+ pshufb xm1, xm6
+ pmaddubsw xm0, xm3
+ pmaddubsw xm1, xm3
+ phaddw xm0, xm1
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
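+ ; Filter 8 pixels per 128-bit lane: the three shuffles build the overlapping
+ ; source windows, pmaddubsw applies taps 0-3 (m9) and 4-7 (m10) as byte
+ ; pairs, phaddw merges the partial sums, and pw_34 + psraw 6 rounds.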
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ pshufb m%2, m%1, m7
+ pshufb m%3, m%1, m8
+ pshufb m%1, m6
+ pmaddubsw m%4, m%2, m9
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m10
+ pmaddubsw m%1, m9
+ paddw m%3, m%4
+ paddw m%1, m%2
+ phaddw m%1, m%3
+ paddw m%1, m5
+ psraw m%1, 6
+%endmacro
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ PUT_8TAP_H 0, 2, 3, 4
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ sub dstq, r6
+ mov r4, r6
+.h_loop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 32
+ jle .h_loop
+ add srcq, ssq
+ add dstq, dsq
+ mov r6, r4
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
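+ ; heights < 6 use the 4-tap filter index (low byte of the packed value)
+ ; instead of the 8-tap one (bits 16-23)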
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [r8+myq*8+subpel_filters-put_avx2]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
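+ ; the 8 vertical taps are broadcast as 4 byte pairs; source rows are
+ ; interleaved as 01/12, 23/34, 45/56, 67/78 so each pmaddubsw produces
+ ; partial sums for two output rows at once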
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrw xm2, [srcq+ssq*1], 2
+ pinsrw xm2, [srcq+ssq*2], 4
+ add srcq, ss3q
+ pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ add srcq, ss3q
+ pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w4_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m9 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, m10 ; a2 b2
+ paddw m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, m11 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ vextracti128 xm4, m5, 1
+ packuswb xm5, xm4
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq*8-128]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*2]
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vbroadcasti128 m1, [srcq+ssq*1]
+ vbroadcasti128 m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m3, [srcq+ssq*0]
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m13, [srcq+ssq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ vpermq m14, m14, q3120
+ mova [dstq+dsq*0], xm14
+ vextracti128 [dstq+dsq*1], m14, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m8, [pw_8192]
+ vpbroadcastd m9, [pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m6, [subpel_h_shuf4]
+ movq xm2, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*1]
+ movq xm0, [srcq+ssq*2]
+ add srcq, ss3q
+ movhps xm0, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m2, m3, 0x30
+ vpblendd m0, m1, 0x30
+ vpblendd m2, m4, 0xc0
+ pshufb m2, m6
+ pshufb m0, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ phaddw m2, m0
+ pmulhrsw m2, m8
+ vextracti128 xm3, m2, 1
+ palignr xm4, xm3, xm2, 4
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+ pshufd xm0, xm3, q2121
+ punpcklwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ movq xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm4, [srcq+ssq*0]
+ pshufb xm4, xm6
+ pmaddubsw xm4, xm7
+ pmaddwd xm5, xm1, xm10 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm11 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm12 ; a2 b2
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm8
+ paddd xm5, xm3
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm3, xm13 ; a3 b3
+ paddd xm5, xm9
+ paddd xm5, xm4
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m6, [subpel_h_shuf4]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m4, 0xcc ; 4 5
+ pshufb m2, m6
+ pshufb m0, m6
+ pshufb m3, m6
+ pshufb m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ pmaddubsw m3, m7
+ pmaddubsw m1, m7
+ phaddw m2, m0
+ phaddw m3, m1
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ palignr m4, m3, m2, 4
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m1, m10 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m11 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m12 ; a2 b2
+ paddd m5, m3
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpblendd m4, m3, 0xcc ; 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ phaddw m4, m4
+ pmulhrsw m4, m8
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; 67 78
+ pmaddwd m4, m3, m13 ; a3 b3
+ paddd m5, m9
+ paddd m5, m4
+ psrad m5, 10
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4
+ packuswb xm5, xm5
+ pshuflw xm5, xm5, q3120
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ movu xm4, [srcq+ssq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ movu xm5, [srcq+ssq*1]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+ssq*1], 1 ; 1 4
+ vinserti128 m6, [srcq+ssq*2], 1 ; 2 5
+ add srcq, ss3q
+ vinserti128 m0, [srcq+ssq*0], 1 ; 3 6
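+ ; HV_H_W8 does the horizontal 8-tap pass at 16-bit precision; the pw_8192
+ ; pmulhrsw rounding is applied by the caller afterwards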
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ pshufb %3, %1, %6
+ pshufb %4, %1, %7
+ pshufb %1, %5
+ pmaddubsw %2, %3, m10
+ pmaddubsw %4, m11
+ pmaddubsw %3, m11
+ pmaddubsw %1, m10
+ paddw %2, %4
+ paddw %1, %3
+ phaddw %1, %2
+%endmacro
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 r6m, m0, 1 ; not enough registers
+ movu xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_512]
+ vbroadcasti128 m6, r6m
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 10
+ psrad m7, 10
+ packssdw m8, m7
+ vextracti128 xm7, m8, 1
+ packuswb xm8, xm7
+ pshufd xm7, xm8, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r4, 8
+ add r7, 8
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
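+; Like PUT_8TAP_H, but rounds with pmulhrsw (pw_8192) and keeps the result as
+; 16-bit intermediates instead of packing to pixels.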
+%macro PREP_8TAP_H 0
+ pshufb m1, m0, m5
+ pshufb m2, m0, m6
+ pshufb m3, m0, m7
+ pmaddubsw m1, m8
+ pmaddubsw m0, m2, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ paddw m1, m2
+ paddw m0, m3
+ phaddw m0, m1, m0
+ pmulhrsw m0, m4
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep%+SUFFIX]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pw_8192]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq m2, [srcq+strideq*2]
+ movq xm1, [srcq+strideq*1]
+ vpblendd m0, m2, 0xf0
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m2, 0xf0
+ pshufb m0, m5
+ pshufb m1, m5
+ pmaddubsw m0, m6
+ pmaddubsw m1, m6
+ phaddw m0, m1
+ pmulhrsw m0, m4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+strideq*1+8*0]
+ vinserti128 m0, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu xm0, [srcq+r6+8*0]
+ vinserti128 m0, [srcq+r6+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+r6+8*2]
+ vinserti128 m0, [srcq+r6+8*3], 1
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ add r6, 32
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only; having
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
+ lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ add srcq, stride3q
+ vpbroadcastd m3, [srcq+strideq*0]
+ vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd m0, [srcq+strideq*1]
+ vpbroadcastd m2, [srcq+strideq*2]
+ vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vbroadcasti128 m5, [deint_shuf4]
+ vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw m1, m2, m3 ; 01 12 23 34
+ vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw m2, m3 ; 23 34 45 56
+.v_w4_loop:
+ lea srcq, [srcq+strideq*4]
+ pinsrd xm0, [srcq+strideq*0], 1
+ vpbroadcastd m3, [srcq+strideq*1]
+ vpbroadcastd m4, [srcq+strideq*2]
+ vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _
+ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb m3, m5 ; 67 78 89 9a
+ pmaddubsw m4, m1, m8
+ vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
+ pmaddubsw m2, m9
+ paddw m4, m2
+ mova m2, m3
+ pmaddubsw m3, m11
+ paddw m3, m4
+ pmaddubsw m4, m1, m10
+ paddw m3, m4
+ pmulhrsw m3, m7
+ mova [tmpq], m3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m5, m2, m9 ; a1
+ pmaddubsw m6, m2, m8 ; b0
+ vpblendd m2, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 67 78
+ pmaddubsw m1, m8 ; a0
+ pmaddubsw m4, m3, m9 ; b1
+ paddw m5, m1
+ mova m1, m3
+ pmaddubsw m3, m10 ; a2
+ paddw m6, m4
+ paddw m5, m3
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 89 9a
+ pmaddubsw m4, m2, m11 ; a3
+ paddw m5, m4
+ pmaddubsw m4, m2, m10 ; b2
+ paddw m6, m4
+ pmaddubsw m4, m3, m11 ; b3
+ paddw m6, m4
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ mova [tmpq+32*0], m5
+ mova [tmpq+32*1], m6
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ add wd, wd
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+wq*8-256]
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*0]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+strideq*0]
+ shufpd m4, m4, m0, 0x0c
+ shufpd m5, m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m13, [srcq+strideq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova [tmpq+wq*0], m14
+ mova [tmpq+wq*1], m15
+ lea tmpq, [tmpq+wq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp .hv_w8
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mova m7, [subpel_h_shuf4]
+ pmovzxbd m9, [deint_shuf4]
+ vpbroadcastd m10, [pw_8192]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ vpbroadcastq m2, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m6, 0xcc ; 4 5
+ pshufb m2, m7 ; 00 01 10 11 02 03 12 13
+ pshufb m0, m7 ; 20 21 30 31 22 23 32 33
+ pshufb m3, m7 ; 40 41 50 51 42 43 52 53
+ pshufb m1, m7 ; 60 61 60 61 62 63 62 63
+ pmaddubsw m2, m8
+ pmaddubsw m0, m8
+ pmaddubsw m3, m8
+ pmaddubsw m1, m8
+ phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b
+ phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ pmaddwd m5, m1, m12 ; a0 b0
+ pmaddwd m6, m2, m12 ; c0 d0
+ pmaddwd m2, m13 ; a1 b1
+ pmaddwd m4, m3, m13 ; c1 d1
+ mova m1, m3
+ pmaddwd m3, m14 ; a2 b2
+ paddd m5, m2
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ paddd m6, m4
+ vpbroadcastq m4, [srcq+strideq*0]
+ paddd m5, m3
+ vpbroadcastq m3, [srcq+strideq*1]
+ vpblendd m2, m4, 0xcc
+ vpbroadcastq m4, [srcq+strideq*2]
+ vpblendd m3, m4, 0xcc
+ pshufb m2, m7
+ pshufb m3, m7
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ phaddw m2, m3
+ pmulhrsw m2, m10
+ palignr m3, m2, m0, 12
+ mova m0, m2
+ punpcklwd m2, m3, m0 ; 67 78
+ punpckhwd m3, m0 ; 89 9a
+ pmaddwd m4, m2, m14 ; c2 d2
+ paddd m6, m11
+ paddd m5, m11
+ paddd m6, m4
+ pmaddwd m4, m2, m15 ; a3 b3
+ paddd m5, m4
+ pmaddwd m4, m3, m15 ; c3 d3
+ paddd m6, m4
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ vpermd m5, m9, m5
+ mova [tmpq], m5
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ lea r6d, [wq*8-64]
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+r6*4]
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ movu xm4, [srcq+strideq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ movu xm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+strideq*0]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+strideq*0], 1 ; 1 4
+ vinserti128 m6, [srcq+strideq*1], 1 ; 2 5
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 3 6
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 [tmpq], m0, 1 ; not enough registers
+ movu xm0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_32]
+ vbroadcasti128 m6, [tmpq]
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 6
+ psrad m7, 6
+ packssdw m8, m7
+ vpermq m7, m8, q3120
+ mova [tmpq+wq*0], xm7
+ vextracti128 [tmpq+wq*2], m7, 1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r5, 8
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
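+ ; Note: the prep variant renames registers at assembly time (see the REMAP
+ ; macros above) so that put and prep can share one code body; RET has to be
+ ; assembled with the default names, hence the restore before it and the
+ ; optional re-apply for returns taken in the middle of the shared body.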
+
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movq xm%1, [srcq+ r4]
+ movq xm%2, [srcq+ r6]
+ movhps xm%1, [srcq+ r7]
+ movhps xm%2, [srcq+ r9]
+ vinserti128 m%1, [srcq+r10], 1
+ vinserti128 m%2, [srcq+r11], 1
+ vpbroadcastq m%5, [srcq+r13]
+ vpbroadcastq m%6, [srcq+ rX]
+ add srcq, ssq
+ movq xm%3, [srcq+ r4]
+ movq xm%4, [srcq+ r6]
+ movhps xm%3, [srcq+ r7]
+ movhps xm%4, [srcq+ r9]
+ vinserti128 m%3, [srcq+r10], 1
+ vinserti128 m%4, [srcq+r11], 1
+ vpbroadcastq m%7, [srcq+r13]
+ vpbroadcastq m%8, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m%1, m%5, 0xc0
+ vpblendd m%2, m%6, 0xc0
+ vpblendd m%3, m%7, 0xc0
+ vpblendd m%4, m%8, 0xc0
+ pmaddubsw m%1, m15
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m15
+ pmaddubsw m%4, m10
+ phaddw m%1, m%2
+ phaddw m%3, m%4
+ phaddw m%1, m%3
+ pmulhrsw m%1, m12
+%endmacro
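+ ; Gathers two source rows at the eight per-column x offsets held in
+ ; r4/r6/r7/r9/r10/r11/r13/rX, applies the per-column horizontal filters in
+ ; m15/m10 and horizontally sums the taps, leaving rounded 16-bit results for
+ ; both rows in m%1 (see the "0a 1a 0b 1b" annotations at the call sites).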
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+120]
+ %xdefine base_reg r11
+ %define rndshift 6
+%endif
+ lea base_reg, [%1_8tap_scaled_8bpc_avx2]
+%define base base_reg-%1_8tap_scaled_8bpc_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm14, mxd
+ vpbroadcastd m14, xm14
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
+ vpbroadcastd m14, mxm
+%endif
+ mov dyd, dym
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %define dsm [rsp+112]
+ %define rX r1
+ %define rXd r1d
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+112]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ vpbroadcastd m10, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+pw_8192]
+%ifidn %1, put
+ vpbroadcastd m13, [base+pd_512]
+%else
+ vpbroadcastd m13, [base+pd_32]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0,1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpblendd m15, m7, 0xaa
+ vpblendd m0, m2, 0xc0 ; 0 1 4 5
+ vpblendd m1, m3, 0xc0 ; 2 3 6 7
+ pblendvb m15, m11, m8
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1 ; 4 5 6 7
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ pmovsxbw xm11, xm11
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ pmaddwd xm8, xm4, xm11
+ paddd xm5, xm6
+ paddd xm7, xm8
+ paddd xm5, xm13
+ paddd xm5, xm7
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq], xm5, 0
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movq xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movhps xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
+.w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd xm15, xm0
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m0, m9
+ psrld m14, 10
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m15, xm15, 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pblendvb m15, m11, m0
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ pmulhrsw m7, m12 ; 0 1 4 5
+ pmulhrsw m8, m12 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm11, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm11 ; 67
+ mova [rsp+0x00], xm7
+ mova [rsp+0x10], xm8
+ mova [rsp+0x20], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm10, r6q
+ pmovsxbw xm10, xm10
+ pshufd xm7, xm10, q0000
+ pshufd xm8, xm10, q1111
+ pshufd xm9, xm10, q2222
+ pshufd xm10, xm10, q3333
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pmaddwd xm6, xm2, xm9
+ pmaddwd xm7, xm3, xm10
+ paddd xm4, xm5
+ paddd xm6, xm7
+ paddd xm4, xm13
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq], xm4
+ add dstq, dsq
+%else
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ movu xm4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x00]
+ mova [rsp+0x00], xm1
+ mova xm1, [rsp+0x10]
+ mova [rsp+0x10], xm2
+ mova xm2, [rsp+0x20]
+ mova [rsp+0x20], xm3
+ pshufb xm4, xm14
+ pmaddubsw xm4, xm15
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm12
+ punpcklwd xm3, xm11, xm4
+ mova xm11, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm5, [srcq+ssq*1]
+ movu m6, [rsp+0x10]
+ pshufb xm4, xm14
+ pshufb xm5, xm14
+ pmaddubsw xm4, xm15
+ pmaddubsw xm5, xm15
+ movu [rsp+0x00], m6
+ phaddw xm4, xm5
+ pmulhrsw xm4, xm12
+ punpcklwd xm9, xm11, xm4
+ mova [rsp+0x20], xm9
+ psrldq xm11, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm11
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+.w8:
+ mov dword [rsp+48], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+48], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+48], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+48], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+48], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+72], t0d
+ mov [rsp+56], srcq
+ mov [rsp+64], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+48]
+ jz .ret
+ add qword [rsp+64], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+16]
+ vpbroadcastd m15, [rsp+72]
+ pxor m9, m9
+ mov srcq, [rsp+56]
+ mov r0q, [rsp+64] ; dstq / tmpq
+.hloop:
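+ ; compute the eight per-column filter indices and x offsets from mx+dx*[0-7]:
+ ; the 8-tap coefficients are gathered into m15/m10 and the x offsets are kept
+ ; in r4..rX (and spilled to [rsp]) for MC_8TAP_SCALED_H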
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+16], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ mova [rsp], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklqdq xm11, xm11
+ pmovsxbw m11, xm11
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufd m8, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
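+ ; bit 10 of the accumulated my tells whether the source position advanced by
+ ; one row (set) or two rows (clear); load one or two new lines accordingly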
+ test myd, 0x400
+ mov [rsp+52], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ vpbroadcastq m7, [srcq+r13]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ add srcq, ssq
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
+ vinserti128 m5, [srcq+r10], 1
+ vinserti128 m6, [srcq+r11], 1
+ vpbroadcastq m9, [srcq+r13]
+ vpbroadcastq m11, [srcq+ rX]
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ vpblendd m3, m7, 0xc0
+ vpblendd m4, m8, 0xc0
+ vpblendd m5, m9, 0xc0
+ vpblendd m6, m11, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ pmaddubsw m5, m15
+ pmaddubsw m6, m10
+ phaddw m3, m4
+ phaddw m5, m6
+ psrld m4, m3, 16
+ pslld m6, m5, 16
+ paddw m3, m4
+ paddw m5, m6
+ pblendw m3, m5, 0xaa
+ pmulhrsw m3, m12
+ jmp .vloop
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ add srcq, ss3q
+ movq xm10, r4q
+ pmovsxbw xm10, xm10
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ pshufd xm8, xm10, q0000
+ pshufd xm9, xm10, q1111
+ pshufd xm11, xm10, q3333
+ pshufd xm10, xm10, q2222
+ vpblendd m0, m2, 0xc0
+ pshufb m1, m14
+ pshufb m0, m14
+ pmaddubsw m1, m15
+ pmaddubsw m0, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4
+ pshufd xm4, xm1, q2121
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ punpcklwd xm2, xm1, xm4 ; 45 56
+.dy1_w2_loop:
+ movq xm1, [srcq+ssq*0]
+ movhps xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ mova xm3, xm0
+ mova xm0, xm2
+ paddd xm5, xm13
+ paddd xm6, xm7
+ pshufb xm1, xm14
+ pmaddubsw xm1, xm15
+ phaddw xm1, xm1
+ pmulhrsw xm1, xm12
+ palignr xm7, xm1, xm4, 12
+ punpcklwd xm2, xm7, xm1 ; 67 78
+ pmaddwd xm7, xm2, xm11
+ mova xm4, xm1
+ paddd xm5, xm6
+ paddd xm5, xm7
+ psrad xm5, rndshift
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+.dy1_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ vpermq m8, m8, q3120
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r11d, xm15, 1
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ movu xm2, [srcq+ssq*0]
+ movu xm3, [srcq+ssq*2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 1
+ vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20
+ vinserti128 m2, [srcq+ssq*1], 1
+ vinserti128 m3, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*2]
+ vinserti128 m4, [srcq+ssq*1], 1
+ add srcq, ss3q
+ vpblendd m15, m7, 0x30
+ punpcklqdq m15, m15
+ pblendvb m15, m11, m8
+ movq xm10, r4q
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb xm5, xm14
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m2, m3
+ phaddw m4, m5
+ pmulhrsw m2, m12
+ pmulhrsw m4, m12
+ palignr m5, m4, m2, 4
+ pshufd m3, m4, q2121
+ punpcklwd m0, m2, m5 ; 01 12
+ punpckhwd m1, m2, m5 ; 23 34
+ punpcklwd m2, m4, m3 ; 45 56
+.dy1_w4_loop:
+ movu xm11, [srcq+ssq*0]
+ vinserti128 m11, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ mova m0, m1
+ mova m1, m2
+ paddd m4, m13
+ paddd m5, m6
+ pshufb m11, m14
+ vpermq m11, m11, q3120
+ pmaddubsw m11, m15
+ phaddw m11, m11
+ pmulhrsw m11, m12
+ palignr m6, m11, m3, 12
+ punpcklwd m2, m6, m11 ; 67 78
+ mova m3, m11
+ pmaddwd m6, m2, m10
+ paddd m4, m5
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ pshuflw xm4, xm4, q3120
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ pshufd xm4, xm4, q3120
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET
+.dy1_w8:
+ mov dword [rsp+72], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+72], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+72], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+72], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+72], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+76], t0d
+ mov [rsp+80], srcq
+ mov [rsp+88], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+96], xm0
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+72]
+ jz .ret
+ add qword [rsp+88], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+32]
+ vpbroadcastd m15, [rsp+76]
+ pxor m9, m9
+ mov srcq, [rsp+80]
+ mov r0q, [rsp+88] ; dstq / tmpq
+.dy1_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+32], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movq [rsp+64], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ movu [rsp], m10
+ vpbroadcastd m8, [rsp+0x60]
+ vpbroadcastd m9, [rsp+0x64]
+ vpbroadcastd m10, [rsp+0x68]
+ vpbroadcastd m11, [rsp+0x6c]
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, [rsp]
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ vpbroadcastq m2, [srcq+ssq*1]
+ movhps xm0, [srcq+ssq*2]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ movhps xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vpblendd m0, m2, 0x30
+ vpblendd m1, m4, 0xc0
+ vpblendd m0, m3, 0xc0
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ movq xm11, r4q
+ pmovsxbw xm11, xm11
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 01 23
+ punpckhwd xm2, xm1 ; 23 45
+.dy2_w2_loop:
+ movq xm6, [srcq+ssq*0]
+ vpbroadcastq m7, [srcq+ssq*1]
+ movhps xm6, [srcq+ssq*2]
+ vpbroadcastq m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd xm4, xm3, xm8
+ pmaddwd xm5, xm2, xm9
+ vpblendd m6, m7, 0x30
+ vpblendd m6, m1, 0xc0
+ pshufb m6, m14
+ pmaddubsw m6, m15
+ phaddw m6, m6
+ pmulhrsw m6, m12
+ palignr m0, m6, m0, 8
+ pshufd m2, m0, q3221
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 45 67
+ punpckhwd xm2, xm1 ; 67 89
+ pmaddwd xm6, xm3, xm10
+ pmaddwd xm7, xm2, xm11
+ paddd xm4, xm5
+ paddd xm4, xm13
+ paddd xm6, xm7
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+ packuswb xm4, xm4
+ pextrw [dstq+dsq*0], xm4, 0
+ pextrw [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+.dy2_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu xm0, [srcq+ssq*0]
+ movu xm2, [srcq+ssq*2]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm1, [srcq+ssq*1]
+ movu xm3, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vinserti128 m15, xm15, 1
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m2, [srcq+ssq*0], 1
+ vinserti128 m3, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pblendvb m15, m11, m8
+ pshufb xm0, xm14
+ pshufb m2, m14
+ pshufb xm1, xm14
+ pshufb m3, m14
+ pmaddubsw xm0, xm15
+ pmaddubsw m2, m15
+ pmaddubsw xm1, xm15
+ pmaddubsw m3, m15
+ movq xm11, r4q
+ punpcklqdq xm11, xm11
+ pmovsxbw m11, xm11
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 2 _ 4
+ pmulhrsw m1, m12 ; 1 3 _ 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ punpcklwd xm2, xm0, xm1
+ punpckhwd m1, m0, m1 ; 23 45
+ vinserti128 m0, m2, xm1, 1 ; 01 23
+.dy2_w4_loop:
+ movu xm6, [srcq+ssq*0]
+ movu xm7, [srcq+ssq*1]
+ vinserti128 m6, [srcq+ssq*2], 1
+ vinserti128 m7, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufb m6, m14
+ pshufb m7, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ psrld m2, m6, 16
+ pslld m3, m7, 16
+ paddw m6, m2
+ paddw m7, m3
+ pblendw m6, m7, 0xaa ; 67 89
+ pmulhrsw m6, m12
+ paddd m4, m5
+ vperm2i128 m0, m1, m6, 0x21 ; 45 67
+ mova m1, m6
+ pmaddwd m6, m0, m10
+ pmaddwd m7, m1, m11
+ paddd m4, m13
+ paddd m6, m7
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+.dy2_w8:
+ mov dword [rsp+40], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+40], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+40], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+40], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+40], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+64], t0d
+ mov [rsp+48], srcq
+ mov [rsp+56], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+40]
+ jz .ret
+ add qword [rsp+56], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp]
+ vpbroadcastd m15, [rsp+64]
+ pxor m9, m9
+ mov srcq, [rsp+48]
+ mov r0q, [rsp+56] ; dstq / tmpq
+.dy2_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m11, [rsp+0x58]
+ vpbroadcastd m4, [rsp+0x5c]
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ SWAP m14, m4
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m14
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ vpbroadcastq m5, [srcq+r13]
+ vpbroadcastq m6, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m3, m5, 0xc0
+ vpblendd m4, m6, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ phaddw m3, m4
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ psrld m5, m3, 16
+ pslld m6, m4, 16
+ paddw m3, m5
+ paddw m4, m6
+ pblendw m3, m4, 0xaa
+ pmulhrsw m3, m12
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_8bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, t0d
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%macro WARP_V 5 ; dst, 02, 46, 13, 57
+ ; Can be done using gathers, but that's terribly slow on many CPUs
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+myq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+myq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ punpcklwd m8, m0
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+ punpcklwd m0, m9, m0
+ punpckldq m9, m8, m0
+ punpckhdq m0, m8, m0
+ punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ pmaddwd m%2, m8
+ pmaddwd m9, m%3
+ punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m8, m%4
+ pmaddwd m0, m%5
+ paddd m%2, m9
+ paddd m0, m8
+ paddd m%1, m0, m%2
+%endmacro
+
+cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
+%if WIN64
+ sub rsp, 0xa0
+%endif
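+ ; same as warp_affine_8x8 below, but writes the 16-bit intermediate
+ ; (prep-style) output instead of rounding down to pixels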
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
+.loop:
+ psrad m7, 13
+ psrad m0, 13
+ packssdw m7, m0
+ pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7
+ vpermq m7, m7, q3120
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
+ dec r4d
+ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+
+cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
+ beta, filter, tmp1, delta, my, gamma
+%if WIN64
+ sub rsp, 0xa0
+ %assign xmm_regs_used 16
+ %assign stack_size_padded 0xa0
+ %assign stack_offset stack_offset+stack_size_padded
+%endif
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m7, 18
+ psrad m0, 18
+ packusdw m7, m0
+ pavgw m7, m11 ; (x + (1 << 10)) >> 11
+ vextracti128 xm0, m7, 1
+ packuswb xm7, xm0
+ pshufd xm7, xm7, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ dec r4d
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov abcdq, r5m
+ mov mxd, r6m
+ movaps [rsp+stack_offset+0x10], xmm6
+ movaps [rsp+stack_offset+0x20], xmm7
+ movaps [rsp+0x28], xmm8
+ movaps [rsp+0x38], xmm9
+ movaps [rsp+0x48], xmm10
+ movaps [rsp+0x58], xmm11
+ movaps [rsp+0x68], xmm12
+ movaps [rsp+0x78], xmm13
+ movaps [rsp+0x88], xmm14
+ movaps [rsp+0x98], xmm15
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ mova m12, [warp_8x8_shufA]
+ mova m13, [warp_8x8_shufB]
+ vpbroadcastd m14, [pw_8192]
+ vpbroadcastd m15, [pd_32768]
+ pxor m11, m11
+ lea filterq, [mc_warp_filter2]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+ sub betad, tmp2d ; beta -= alpha*3
+ mov myd, r7m
+ call .h
+ psrld m1, m0, 16
+ call .h
+ psrld m4, m0, 16
+ call .h
+ pblendw m1, m0, 0xaa ; 02
+ call .h
+ pblendw m4, m0, 0xaa ; 13
+ call .h
+ psrld m2, m1, 16
+ pblendw m2, m0, 0xaa ; 24
+ call .h
+ psrld m5, m4, 16
+ pblendw m5, m0, 0xaa ; 35
+ call .h
+ psrld m3, m2, 16
+ pblendw m3, m0, 0xaa ; 46
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ add myd, 512+(64<<10)
+ mov r4d, 4
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+.main2:
+ call .h
+ psrld m6, m5, 16
+ pblendw m6, m0, 0xaa ; 57
+ WARP_V 7, 1, 3, 4, 6
+ call .h
+ mova m1, m2
+ mova m2, m3
+ psrld m3, 16
+ pblendw m3, m0, 0xaa ; 68
+ WARP_V 0, 4, 6, 1, 3
+ mova m4, m5
+ mova m5, m6
+ ret
+ALIGN function_align
+.h:
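+ ; horizontal pass for one 8-pixel row: the filter for output column k is
+ ; selected by (mx + k*alpha) >> 10, and mx advances by beta per row
+ ; (beta was pre-adjusted by -3*alpha in .main)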
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ vbroadcasti128 m10, [srcq]
+ shr mxd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+mxq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+mxq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ punpcklqdq m8, m0 ; 0 1 4 5
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ punpcklqdq m9, m0 ; 2 3 6 7
+ pshufb m0, m10, m12
+ pmaddubsw m0, m8
+ pshufb m10, m13
+ pmaddubsw m10, m9
+ add srcq, ssq
+ phaddw m0, m10
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword
+ ret
+
+%macro BIDIR_FN 1 ; op
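+ ; shared store code for avg/w_avg/mask: %1 produces 32 output pixels per
+ ; invocation and %1_INC_PTR advances the intermediate (and mask) pointers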
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ cmp hd, 8
+ je .ret
+ %1 2
+ lea dstq, [dstq+strideq*4]
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.ret:
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w8:
+ vextracti128 xm1, m0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 4
+ %1 0
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+32], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ %1 0
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+0*32], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+1*32], m0
+ %1_INC_PTR 8
+ %1 -4
+ vpermq m0, m0, q3120
+ mova [dstq+2*32], m0
+ %1 -2
+ vpermq m0, m0, q3120
+ mova [dstq+3*32], m0
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
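+ ; (a + b + 16) >> 5
+ ; pmulhrsw with 1024 performs the rounded shift on the 16-bit sums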
+ mova m0, [tmp1q+(%1+0)*32]
+ paddw m0, [tmp2q+(%1+0)*32]
+ mova m1, [tmp1q+(%1+1)*32]
+ paddw m1, [tmp2q+(%1+1)*32]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*32
+ add tmp2q, %1*32
+%endmacro
+
+cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg %+ SUFFIX %+ _table
+ lea r6, [avg %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m2, [base+pw_1024]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+ mova m0, [tmp1q+(%1+0)*32]
+ psubw m2, m0, [tmp2q+(%1+0)*32]
+ mova m1, [tmp1q+(%1+1)*32]
+ psubw m3, m1, [tmp2q+(%1+1)*32]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg %+ SUFFIX %+ _table
+ lea r6, [w_avg %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
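+ ; for weight <= 7, (weight << 12) as int16 no longer equals (weight-16) << 12,
+ ; so swap the operands and negate the weight (last form in the comment above)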
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ pxor m0, m0
+ mov tmp1q, tmp2q
+ psubw m4, m0, m4 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+ vpermq m3, [maskq+%1*16], q3120
+ mova m0, [tmp2q+(%1+0)*32]
+ psubw m1, m0, [tmp1q+(%1+0)*32]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*32]
+ psubw m2, m1, [tmp1q+(%1+1)*32]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*16
+ add tmp2q, %1*32
+ add tmp1q, %1*32
+%endmacro
+
+cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask %+ SUFFIX %+ _table
+ lea r7, [mask %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ pxor m4, m4
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
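+ ; like MASK, but the mask is derived from the absolute difference of the two
+ ; intermediates instead of being loaded; m%2 returns the "64 - m" values
+ ; (the 4:4:4 path additionally packs them to bytes and subtracts them from m5)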
+ mova m%1, [tmp1q+32*%3]
+ mova m1, [tmp2q+32*%3]
+ psubw m1, m%1
+ pabsw m%2, m1
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m%1, m1
+ mova m1, [tmp1q+32*%4]
+ mova m2, [tmp2q+32*%4]
+ psubw m2, m1
+ pabsw m3, m2
+ psubusw m3, m6, m3
+ psrlw m3, 8
+%if %5
+ packuswb m%2, m3
+ psubb m%2, m5, m%2
+ vpermq m%2, m%2, q3120
+%else
+ phaddw m%2, m3
+%endif
+ psllw m3, 10
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m%1, m7
+ pmulhrsw m1, m7
+ packuswb m%1, m1
+%endmacro
+
+cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx2_table
+ lea r6, [blend_avx2_table]
+ tzcnt wd, wm
+ movifnidn maskq, maskmp
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m4, [base+pb_64]
+ vpbroadcastd m5, [base+pw_512]
+ sub tmpq, maskq
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ vpbroadcastd xm1, [dstq+dsq*2]
+ pinsrd xm1, [dstq+r6 ], 3
+ mova xm6, [maskq]
+ psubb xm3, xm4, xm6
+ punpcklbw xm2, xm3, xm6
+ punpckhbw xm3, xm6
+ mova xm6, [maskq+tmpq]
+ add maskq, 4*4
+ punpcklbw xm0, xm6
+ punpckhbw xm1, xm6
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm3
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ packuswb xm0, xm1
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ pextrd [dstq+dsq*2], xm0, 2
+ pextrd [dstq+r6 ], xm0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+ALIGN function_align
+.w8:
+ movq xm1, [dstq+dsq*0]
+ movhps xm1, [dstq+dsq*1]
+ vpbroadcastq m2, [dstq+dsq*2]
+ vpbroadcastq m3, [dstq+r6 ]
+ mova m0, [maskq]
+ mova m6, [maskq+tmpq]
+ add maskq, 8*4
+ vpblendd m1, m2, 0x30
+ vpblendd m1, m3, 0xc0
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ movq [dstq+dsq*2], xm1
+ movhps [dstq+r6 ], xm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+ALIGN function_align
+.w16:
+ mova m0, [maskq]
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ mova m6, [maskq+tmpq]
+ add maskq, 16*2
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16
+ RET
+ALIGN function_align
+.w32:
+ mova m0, [maskq]
+ mova m1, [dstq]
+ mova m6, [maskq+tmpq]
+ add maskq, 32
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx2_table
+ lea r5, [blend_v_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_avx2_table
+ jmp wq
+.w2:
+ vpbroadcastd xm2, [maskq+2*2]
+.w2_s0_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm1, [tmpq]
+ add tmpq, 2*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s0_loop
+ RET
+ALIGN function_align
+.w4:
+ vpbroadcastq xm2, [maskq+4*2]
+.w4_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ movq xm1, [tmpq]
+ add tmpq, 4*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova xm3, [maskq+8*2]
+.w8_loop:
+ movq xm0, [dstq+dsq*0]
+ vpbroadcastq xm1, [dstq+dsq*1]
+ mova xm2, [tmpq]
+ add tmpq, 8*2
+ punpcklbw xm0, xm2
+ punpckhbw xm1, xm2
+ pmaddubsw xm0, xm3
+ pmaddubsw xm1, xm3
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m3, [maskq+16*2]
+ vbroadcasti128 m4, [maskq+16*3]
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ mova m2, [tmpq]
+ add tmpq, 16*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ mova xm3, [maskq+16*4]
+ vinserti128 m3, [maskq+16*6], 1
+ mova xm4, [maskq+16*5]
+ vinserti128 m4, [maskq+16*7], 1
+.w32_loop:
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .w32_loop
+ RET
+
+cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_h_avx2_table
+ lea r5, [blend_h_avx2_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, dword [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm2, [maskq+hq*2]
+ movd xm1, [tmpq]
+ add tmpq, 2*2
+ punpcklwd xm2, xm2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+ALIGN function_align
+.w4:
+ mova xm3, [blend_shuf]
+.w4_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ movd xm2, [maskq+hq*2]
+ movq xm1, [tmpq]
+ add tmpq, 4*2
+ pshufb xm2, xm3
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ vbroadcasti128 m4, [blend_shuf]
+ shufpd m4, m4, 0x03
+.w8_loop:
+ vpbroadcastq m1, [dstq+dsq*0]
+ movq xm0, [dstq+dsq*1]
+ vpblendd m0, m1, 0x30
+ vpbroadcastd m3, [maskq+hq*2]
+ movq xm1, [tmpq+8*1]
+ vinserti128 m1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ pshufb m3, m4
+ punpcklbw m0, m1
+ pmaddubsw m0, m3
+ pmulhrsw m0, m5
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movhps [dstq+dsq*0], xm0
+ movq [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m4, [blend_shuf]
+ shufpd m4, m4, 0x0c
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ vpbroadcastd m3, [maskq+hq*2]
+ mova m2, [tmpq]
+ add tmpq, 16*2
+ pshufb m3, m4
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32: ; w32/w64/w128
+ sub dsq, r6
+.w32_loop0:
+ vpbroadcastw m3, [maskq+hq*2]
+ mov wd, r6d
+.w32_loop:
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 32
+ sub wd, 32
+ jg .w32_loop
+ add dstq, dsq
+ inc hq
+ jl .w32_loop0
+ RET
+
+cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than the width, so we can
+ ; safely overwrite by a few bytes
+
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor r12d, r12d
+ lea r10, [ihq-1]
+ cmp yq, ihq
+ cmovs r10, yq
+ test yq, yq
+ cmovs r10, r12
+ imul r10, sstrideq
+ add srcq, r10
+
+ ; ref += iclip(x, 0, iw - 1)
+ lea r10, [iwq-1]
+ cmp xq, iwq
+ cmovs r10, xq
+ test xq, xq
+ cmovs r10, r12
+ add srcq, r10
+
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ lea bottomextq, [yq+bhq]
+ sub bottomextq, ihq
+ lea r3, [bhq-1]
+ cmovs bottomextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, r12
+ cmp bottomextq, bhq
+ cmovns bottomextq, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ lea rightextq, [xq+bwq]
+ sub rightextq, iwq
+ lea r2, [bwq-1]
+ cmovs rightextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, r12
+ cmp rightextq, bwq
+ cmovns rightextq, r2
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+ dst, dstride, src, sstride, bottomext, rightext
+
+ ; center_h = bh - top_ext - bottom_ext
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+ imul r2, dstrideq
+ add dstq, r2
+ mov r9m, dstq
+
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+
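+; rough scalar sketch of the fill strategy implemented by the loops below
+; (reference only; left/right/top/bottom_ext and center_w/center_h are the
+; iclip()-derived sizes computed above):
+;   for (y = 0; y < center_h; y++) {
+;       memset(dst, src[0], left_ext);                  // replicate left edge
+;       memcpy(dst + left_ext, src, center_w);          // copy valid pixels
+;       memset(dst + left_ext + center_w,
+;              src[center_w - 1], right_ext);           // replicate right edge
+;       dst += dstride; src += sstride;
+;   }
+;   // afterwards the first/last written row is copied top_ext/bottom_ext
+;   // times upwards/downwards (.top_x_loop / .bottom_x_loop)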
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3:
+%if %1
+ ; left extension
+ xor r3, r3
+ vpbroadcastb m0, [srcq]
+.left_loop_%3:
+ mova [dstq+r3], m0
+ add r3, 32
+ cmp r3, leftextq
+ jl .left_loop_%3
+
+ ; body
+ lea r12, [dstq+leftextq]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ movu m0, [srcq+r3]
+%if %1
+ movu [r12+r3], m0
+%else
+ movu [dstq+r3], m0
+%endif
+ add r3, 32
+ cmp r3, centerwq
+ jl .body_loop_%3
+
+%if %2
+ ; right extension
+%if %1
+ add r12, centerwq
+%else
+ lea r12, [dstq+centerwq]
+%endif
+ xor r3, r3
+ vpbroadcastb m0, [srcq+centerwq-1]
+.right_loop_%3:
+ movu [r12+r3], m0
+ add r3, 32
+ cmp r3, rightextq
+ jl .right_loop_%3
+
+%endif
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+%endmacro
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ test rightextq, rightextq
+ jnz .need_right_ext
+ v_loop 0, 0, 0
+ jmp .body_done
+
+.need_left_ext:
+ test rightextq, rightextq
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+ ; bottom edge extension
+ test bottomextq, bottomextq
+ jz .top
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+.bottom_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, bottomextq
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .bottom_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+ mov srcq, r9m
+ mov dstq, dstm
+ xor r1, r1
+.top_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, topextq
+.top_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .top_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+
+ vpbroadcastd xm3, [base+pw_m256]
+ vpbroadcastd m7, [base+pd_63]
+ vbroadcasti128 m15, [base+pb_8x0_8x8]
+ pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+ pslld m5, 3 ; dx*8
+ pslld m6, 14
+ paddd m8, m2 ; mx+[0..7]*dx
+ pxor m2, m2
+
+ ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
+ ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8
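+ ; scalar reference for one output pixel (illustrative; mx is the 14-bit
+ ; fixed-point source position, dx its per-pixel step, and the stored
+ ; filter coefficients are laid out so that the final pmulhrsw performs
+ ; the rounding):
+ ;   src_x  = iclip(mx, 0, (src_w - 8) << 14) >> 14
+ ;   f      = resize_filter[(mx >> 8) & 63]      ; one of 64 8-tap filters
+ ;   dst[x] = clip_pixel((sum(src[src_x + k] * f[k], k=0..7) + 64) >> 7)
+ ;   mx    += dx
+ ; out-of-bounds taps near the frame edges are remapped via resize_shuf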
+
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ pand m9, m7 ; filter offset (masked)
+
+ ; load source pixels - this ugly code is vpgatherdq emulation since
+ ; directly using vpgatherdq on Haswell is quite a bit slower :(
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vextracti128 xm0, m0, 1
+ movq xm12, [srcq+r8]
+ movq xm13, [srcq+r10]
+ movhps xm12, [srcq+r9]
+ movhps xm13, [srcq+r11]
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vinserti128 m12, [srcq+r8], 1
+ vinserti128 m13, [srcq+r10], 1
+ vpbroadcastq m10, [srcq+r9]
+ vpbroadcastq m11, [srcq+r11]
+ vpblendd m12, m10, 11000000b
+ vpblendd m13, m11, 11000000b
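+ ; (the 16 loads above behave like two 4-index 64-bit gathers; with
+ ; intrinsics this would be roughly _mm256_i32gather_epi64(src, idx, 1)
+ ; per 128-bit half of the index vector in m0 - an assumed equivalence,
+ ; for illustration only)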
+
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ ; this also saves 2 quasi-vpgatherdqs
+ vptest m1, m1
+ jz .filter
+
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vextracti128 xm1, m1, 1
+ movq xm14, [base+resize_shuf+4+r8]
+ movq xm0, [base+resize_shuf+4+r10]
+ movhps xm14, [base+resize_shuf+4+r9]
+ movhps xm0, [base+resize_shuf+4+r11]
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vinserti128 m14, [base+resize_shuf+4+r8], 1
+ vinserti128 m0, [base+resize_shuf+4+r10], 1
+ vpbroadcastq m10, [base+resize_shuf+4+r9]
+ vpbroadcastq m11, [base+resize_shuf+4+r11]
+ vpblendd m14, m10, 11000000b
+ vpblendd m0, m11, 11000000b
+
+ paddb m14, m15
+ paddb m0, m15
+ pshufb m12, m14
+ pshufb m13, m0
+
+.filter:
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vextracti128 xm9, m9, 1
+ movq xm10, [base+resize_filter+r8*8]
+ movq xm11, [base+resize_filter+r10*8]
+ movhps xm10, [base+resize_filter+r9*8]
+ movhps xm11, [base+resize_filter+r11*8]
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vinserti128 m10, [base+resize_filter+r8*8], 1
+ vinserti128 m11, [base+resize_filter+r10*8], 1
+ vpbroadcastq m14, [base+resize_filter+r9*8]
+ vpbroadcastq m1, [base+resize_filter+r11*8]
+ vpblendd m10, m14, 11000000b
+ vpblendd m11, m1, 11000000b
+
+ pmaddubsw m12, m10
+ pmaddubsw m13, m11
+ phaddw m12, m13
+ vextracti128 xm13, m12, 1
+ phaddsw xm12, xm13
+ pmulhrsw xm12, xm3 ; x=(x+64)>>7
+ packuswb xm12, xm12
+ movq [dstq+xq], xm12
+
+ paddd m4, m5
+ add xd, 8
+ cmp xd, dst_wd
+ jl .loop_x
+
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m9, [base+deint_shuf4]
+ vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign
+ add wq, r7
+ W_MASK 0, 4, 0, 1
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm0, m4, 1
+ vpblendd xm1, xm4, xm0, 0x05
+ vpblendd xm4, xm0, 0x0a
+ pshufd xm1, xm1, q2301
+ psubw xm4, xm8, xm4
+ psubw xm4, xm1
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [maskq], xm4
+ RET
+.w4_h16:
+ W_MASK 0, 5, 2, 3
+ lea dstq, [dstq+strideq*4]
+ phaddd m4, m5
+ vextracti128 xm1, m0, 1
+ psubw m4, m8, m4
+ psrlw m4, 2
+ vpermd m4, m9, m4
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], xm4
+ RET
+.w8_loop:
+ add tmp1q, 2*32
+ add tmp2q, 2*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 8
+.w8:
+ vextracti128 xm2, m4, 1
+ vextracti128 xm1, m0, 1
+ psubw xm4, xm8, xm4
+ psubw xm4, xm2
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ movq [maskq], xm4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 0, 5, 2, 3
+ punpckhqdq m1, m4, m5
+ punpcklqdq m4, m5
+ psubw m1, m8, m1
+ psubw m1, m4
+ psrlw m1, 2
+ vpermq m0, m0, q3120
+ packuswb m1, m1
+ vpermd m1, m9, m1
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], xm1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 0, 5, 2, 3
+ psubw m4, m8, m4
+ psubw m4, m5
+ psrlw m4, 2
+ vpermq m0, m0, q3120
+ packuswb m4, m4
+ vpermd m4, m9, m4
+ mova [dstq+strideq*1], m0
+ mova [maskq], xm4
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+ dec hd
+.w64_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ test hd, 1
+ jz .w64_loop_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq], m4
+ add maskq, 32
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop_even:
+ psubw m12, m8, m4
+ psubw m13, m8, m5
+ dec hd
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ add tmp1q, 8*32
+ add tmp2q, 8*32
+ test hd, 1
+ jz .w128_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*0], m4
+ jmp .w128_odd
+.w128_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+.w128_odd:
+ W_MASK 0, 4, -4, -3
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 0, 5, -2, -1
+ vpermq m0, m0, q3120
+ mova [dstq+32*3], m0
+ test hd, 1
+ jz .w128_loop_even
+ psubw m4, m12, m4
+ psubw m5, m13, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*1], m4
+ add maskq, 64
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+ lea r7, [w_mask_422_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ pxor m9, m9
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m10, [base+deint_shuf4]
+ vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign
+ add wq, r7
+ mov maskq, maskmp
+ W_MASK 0, 4, 0, 1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ psubb xm5, xm8, xm4
+ pavgb xm5, xm9
+ pshufd xm5, xm5, q3120
+ mova [maskq], xm5
+ RET
+.w4_h16:
+ W_MASK 0, 5, 2, 3
+ lea dstq, [dstq+strideq*4]
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermd m5, m10, m5
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], m5
+ RET
+.w8_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8:
+ vextracti128 xm5, m4, 1
+ vextracti128 xm1, m0, 1
+ packuswb xm4, xm5
+ psubb xm5, xm8, xm4
+ pavgb xm5, xm9
+ pshufd xm5, xm5, q3120
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ mova [maskq], xm5
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], m5
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*1], m0
+ mova [maskq], m5
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+ add maskq, 32
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq], m5
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+ add maskq, 32*2
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq+32*0], m5
+ W_MASK 0, 4, 4, 5
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 0, 5, 6, 7
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*3], m0
+ mova [maskq+32*1], m5
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx2_table
+ lea r7, [w_mask_444_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m5, [base+pb_64]
+ vpbroadcastd m7, [base+pw_2048]
+ add wq, r7
+ W_MASK 0, 4, 0, 1, 1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ mova [maskq+32*0], m4
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ je .w4_end
+ W_MASK 0, 4, 2, 3, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq+32*1], m4
+.w4_end:
+ RET
+.w8_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w8:
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ mova [maskq], m4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [maskq], m4
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ mova [maskq], m4
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32*2
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ mova [maskq+32*0], m4
+ W_MASK 0, 4, 2, 3, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ mova [maskq+32*1], m4
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32*4
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ mova [maskq+32*0], m4
+ W_MASK 0, 4, 2, 3, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ mova [maskq+32*1], m4
+ W_MASK 0, 4, 4, 5, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ mova [maskq+32*2], m4
+ W_MASK 0, 4, 6, 7, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*3], m0
+ mova [maskq+32*3], m4
+ dec hd
+ jg .w128_loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
new file mode 100644
index 0000000000..7897f1decc
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -0,0 +1,4538 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+obmc_masks:
+pw_512: times 2 dw 512
+ ; 2
+ db 45, 19, 64, 0
+ ; 4
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+ db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+
+warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20
+ db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22
+ db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24
+ db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26
+warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24
+ db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26
+ db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28
+ db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30
+warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
+warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
+pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
+warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43
+pd_16384: dd 16384
+pd_262144: dd 262144
+warp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54
+warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59
+ db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
+bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
+ db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
+ db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+ db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31
+ db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
+ db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
+ db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47
+ db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
+ db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
+ db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+ db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
+ db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
+bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
+ db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
+bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
+ db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
+bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
+ db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
+ db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
+bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7
+ db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
+bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7
+spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
+ db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
+spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+ db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
+spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
+ db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
+spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
+ db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+ db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
+ db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
+ db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+ db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+ db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
+ db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
+spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
+ db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
+spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
+ db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
+spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13
+ db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
+spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
+ db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
+ db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
+ db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
+spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55
+ db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63
+ db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71
+ db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79
+spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13
+ db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
+ db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45
+ db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
+spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
+ db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
+spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36
+ db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38
+spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44
+ db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
+ db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52
+ db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54
+spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40
+ db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42
+ db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48
+ db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50
+spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8
+ db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10
+ db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
+ db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
+spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12
+ db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14
+ db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20
+ db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22
+spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+resize_permC: dd 0, 4, 8, 12
+pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
+
+wm_420_perm64: dq 0xfedcba9876543210
+wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
+
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+pb_127: times 4 db 127
+pw_m128: times 2 dw -128
+pw_m256: times 2 dw -256
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_34: dd 34
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
+
+%define pb_m64 (wm_sign+4)
+%define pb_64 (wm_sign+8)
+%define pd_2 (pd_0to7+8)
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+cextern mc_warp_filter
+cextern resize_filter
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
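+; each table below holds per-width offsets (dw for the put/prep/hv tables,
+; dd for the bidir ones) of the .w* entry points relative to a base label;
+; dispatch then looks like the sequence used in .put further down:
+;   movzx wd, word [r7+wq*2+table_offset(put,)]
+;   add   wq, r7
+;   jmp   wq
+; with wq = log2(width) from tzcnt, so only the listed widths are reachable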
+BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
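+; WRAP_YMM temporarily re-instantiates the wrapped code with ymm-sized
+; registers (m* = ymm*) and restores the zmm instantiation afterwards, so a
+; single 256-bit iteration can reuse code written for 512-bit registers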
+%macro WRAP_YMM 1+
+INIT_YMM cpuname
+ %1
+INIT_ZMM cpuname
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea r7, [put_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ movzx r6d, word [srcq+ssq*0]
+ movzx r7d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6w
+ mov [dstq+dsq*1], r7w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu xmm0, [srcq+ssq*0]
+ movu xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], xmm0
+ mova [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu ym0, [srcq+ssq*0]
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], ym0
+ mova [dstq+dsq*1], ym1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+64*0], m0
+ mova [dstq+dsq*0+64*1], m1
+ mova [dstq+dsq*1+64*0], m2
+ mova [dstq+dsq*1+64*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
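+ ; scalar form per output pixel (illustrative):
+ ;   dst[x] = ((16 - mx) * src[x] + mx * src[x+1] + 8) >> 4
+ ; the imul/add below pack the byte pair (mx, 16-mx) into every word of m5
+ ; ((mxy*0xff01 + (16<<8)) & 0xffff == ((16-mxy)<<8) | mxy), so pmaddubsw
+ ; against the (src[x+1], src[x]) pairs produced by the bilin_h shuffles
+ ; yields the weighted sum, and pmulhrsw with pw_2048 does the (+8)>>4
+ ; rounding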
+ imul mxyd, 0xff01
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16 << 8
+ vpbroadcastw m5, mxyd
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ vpbroadcastd m3, [pw_2048]
+ add wq, r7
+ jmp wq
+.h_w2:
+ movd xmm0, [srcq+ssq*0]
+ pinsrd xmm0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm4
+ pmaddubsw xmm0, xm5
+ pmulhrsw xmm0, xm3
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ mova xmm4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0]
+ movhps xmm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4
+ pmaddubsw xmm0, xm5
+ pmulhrsw xmm0, xm3
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ pmulhrsw ym0, ym3
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+ pmulhrsw m0, m3
+ vpmovuswb ym0, m0
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ add srcq, ssq
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu m0, [srcq+8*0]
+ movu m2, [srcq+8*1]
+ movu m1, [srcq+8*8]
+ movu m6, [srcq+8*9]
+ add srcq, ssq
+ REPX {pshufb x, m4}, m0, m2, m1, m6
+ REPX {pmaddubsw x, m5}, m0, m2, m1, m6
+ REPX {pmulhrsw x, m3}, m0, m2, m1, m6
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0xff01
+ vpbroadcastd m5, [pw_2048]
+ add mxyd, 16 << 8
+ add wq, r7
+ vpbroadcastw m4, mxyd
+ jmp wq
+.v_w2:
+ movd xmm0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
+ pshuflw xmm1, xmm1, q2301 ; 1 0
+ punpcklbw xmm1, xmm0, xmm1
+ pmaddubsw xmm1, xm4
+ pmulhrsw xmm1, xm5
+ packuswb xmm1, xmm1
+ pextrw [dstq+dsq*0], xmm1, 1
+ pextrw [dstq+dsq*1], xmm1, 0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xmm0, [srcq+ssq*0]
+.v_w4_loop:
+ vpbroadcastd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm1, xmm0, 0x02 ; 1 2
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xm4
+ pmulhrsw xmm1, xm5
+ packuswb xmm1, xmm1
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xmm0, [srcq+ssq*0]
+.v_w8_loop:
+ movq xmm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw xmm1, xmm3, xmm0
+ movq xmm0, [srcq+ssq*0]
+ punpcklbw xmm2, xmm0, xmm3
+ pmaddubsw xmm1, xm4
+ pmaddubsw xmm2, xm4
+ pmulhrsw xmm1, xm5
+ pmulhrsw xmm2, xm5
+ packuswb xmm1, xmm2
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu xmm0, [srcq+ssq*0]
+.v_w16_loop:
+ vbroadcasti128 ymm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1
+ vbroadcasti128 ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2
+ punpcklbw ymm1, ymm2, ymm3
+ punpckhbw ymm2, ymm3
+ pmaddubsw ymm1, ym4
+ pmaddubsw ymm2, ym4
+ pmulhrsw ymm1, ym5
+ pmulhrsw ymm2, ym5
+ packuswb ymm1, ymm2
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ vzeroupper
+ RET
+.v_w32:
+ movu ym0, [srcq+ssq*0]
+ kxnorb k1, k1, k1
+.v_w32_loop:
+ vbroadcasti32x8 m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendmd m3{k1}, m2, m0 ; 0 1
+ vbroadcasti32x8 m0, [srcq+ssq*0]
+ vpblendmd m2{k1}, m0, m2 ; 1 2
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+ssq*0]
+.v_w64_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m3, m0
+ punpckhbw m6, m3, m0
+ movu m0, [srcq+ssq*0]
+ pmaddubsw m1, m4
+ pmaddubsw m6, m4
+ punpcklbw m2, m0, m3
+ punpckhbw m7, m0, m3
+ pmaddubsw m2, m4
+ pmaddubsw m7, m4
+ REPX {pmulhrsw x, m5}, m1, m6, m2, m7
+ packuswb m1, m6
+ packuswb m2, m7
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+.v_w128_loop:
+ add srcq, ssq
+ movu m2, [srcq+64*0]
+ movu m3, [srcq+64*1]
+ punpcklbw m6, m2, m0
+ pmaddubsw m6, m4
+ punpckhbw m0, m2, m0
+ pmaddubsw m0, m4
+ punpcklbw m7, m3, m1
+ pmaddubsw m7, m4
+ punpckhbw m1, m3, m1
+ pmaddubsw m1, m4
+ REPX {pmulhrsw x, m5}, m6, m0, m7, m1
+ packuswb m6, m0
+ mova m0, m2
+ packuswb m7, m1
+ mova m1, m3
+ mova [dstq+64*0], m6
+ mova [dstq+64*1], m7
+ add dstq, dsq
+ dec hd
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
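+ ; scalar sketch of this second (vertical) pass on the 16-bit horizontal
+ ; intermediates t[] produced by the .h step (illustrative):
+ ;   d      = t[x + stride] - t[x]
+ ;   dst[x] = clip_pixel((t[x] + ((my * d) >> 4) + 8) >> 4)
+ ; my is pre-shifted left by 11 so that pmulhw(2*d, my<<11) == (my*d) >> 4,
+ ; and the final pmulhrsw with pw_2048 is the (+8)>>4 rounding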
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ vpbroadcastd m7, [pw_2048]
+ add wq, r7
+ vpbroadcastw m6, mxyd
+ jmp wq
+.hv_w2:
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ pshufb xmm0, xm4
+ pmaddubsw xmm0, xm5
+.hv_w2_loop:
+ movd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pinsrd xmm1, [srcq+ssq*0], 1
+ pshufb xmm1, xm4
+ pmaddubsw xmm1, xm5 ; 1 _ 2 _
+ shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm6
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm7
+ packuswb xmm1, xmm1
+ pextrw [dstq+dsq*0], xmm1, 0
+ pextrw [dstq+dsq*1], xmm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova xmm4, [bilin_h_shuf4]
+ movddup xmm0, [srcq+ssq*0]
+ pshufb xmm0, xmm4
+ pmaddubsw xmm0, xm5
+.hv_w4_loop:
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm1, [srcq+ssq*0]
+ pshufb xmm1, xmm4
+ pmaddubsw xmm1, xm5 ; 1 2
+ shufps xmm2, xmm0, xmm1, q1032 ; 0 1
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm6
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm7
+ packuswb xmm1, xmm1
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 ym0, [srcq+ssq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 ym1, [srcq+ssq*0], 1
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2
+ valignq ym2, ym1, ym0, 2
+ mova ym0, ym1
+ psubw ym1, ym2
+ paddw ym1, ym1
+ pmulhw ym1, ym6
+ paddw ym1, ym2
+ pmulhrsw ym1, ym7
+ vpmovuswb xm1, ym1
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ vbroadcasti32x8 m0, [srcq+ssq*0]
+ mova m4, [bilin_h_perm16]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m1, [srcq+ssq*0], 1
+ vpermb m1, m4, m1
+ pmaddubsw m1, m5 ; 1 2
+ valignq m2, m1, m0, 4 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ vpmovuswb ym1, m1
+ mova [dstq+dsq*0], xm1
+ vextracti32x4 [dstq+dsq*1], ym1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+ssq*0]
+ pmovzxbq m8, [pb_02461357]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m2, m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpermb m3, m4, [srcq+ssq*0]
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m2
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ vpermq m1, m8, m1
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m8, m2, m0
+ psubw m9, m3, m1
+ paddw m8, m8
+ pmulhw m8, m6
+ paddw m9, m9
+ pmulhw m9, m6
+ paddw m8, m0
+ pmulhrsw m8, m7
+ paddw m9, m1
+ pmulhrsw m9, m7
+ mova m0, m2
+ mova m1, m3
+ packuswb m8, m9
+ mova [dstq], m8
+ add dstq, dsq
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ movu m2, [srcq+8*8]
+ movu m3, [srcq+8*9]
+ REPX {pshufb x, m4}, m0, m1, m2, m3
+ REPX {pmaddubsw x, m5}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, ssq
+ movu m8, [srcq+8*0]
+ movu m9, [srcq+8*1]
+ movu m10, [srcq+8*8]
+ movu m11, [srcq+8*9]
+ REPX {pshufb x, m4}, m8, m9, m10, m11
+ REPX {pmaddubsw x, m5}, m8, m9, m10, m11
+ psubw m12, m8, m0
+ psubw m13, m9, m1
+ psubw m14, m10, m2
+ psubw m15, m11, m3
+ paddw m12, m12
+ pmulhw m12, m6
+ paddw m13, m13
+ pmulhw m13, m6
+ paddw m14, m14
+ pmulhw m14, m6
+ paddw m15, m15
+ pmulhw m15, m6
+ paddw m12, m0
+ pmulhrsw m12, m7
+ paddw m13, m1
+ pmulhrsw m13, m7
+ paddw m14, m2
+ pmulhrsw m14, m7
+ paddw m15, m3
+ pmulhrsw m15, m7
+ mova m0, m8
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+ packuswb m12, m13
+ packuswb m14, m15
+ mova [dstq+64*0], m12
+ mova [dstq+64*1], m14
+ add dstq, dsq
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+DECLARE_REG_TMP 3, 5, 6
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea t2, [prep_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [t2+wq*2+table_offset(prep,)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd xmm0, [srcq+strideq*0]
+ pinsrd xmm0, [srcq+strideq*1], 1
+ pinsrd xmm0, [srcq+strideq*2], 2
+ pinsrd xmm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw ym0, xmm0
+ psllw ym0, 4
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti128 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pmovzxbw m0, ym0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu xmm0, [srcq+strideq*0]
+ vinserti128 ym0, ymm0, [srcq+strideq*1], 1
+ movu xmm1, [srcq+strideq*2]
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, ym0
+ pmovzxbw m1, ym1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+strideq*0+32*0]
+ pmovzxbw m1, [srcq+strideq*0+32*1]
+ pmovzxbw m2, [srcq+strideq*1+32*0]
+ pmovzxbw m3, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+32*0]
+ pmovzxbw m1, [srcq+32*1]
+ pmovzxbw m2, [srcq+32*2]
+ pmovzxbw m3, [srcq+32*3]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
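+ ; scalar form of the prep intermediate (kept as 16 bits, no rounding yet):
+ ;   tmp[x] = (16 - mx) * src[x] + mx * src[x+1]
+ ; note the unfiltered .prep_w* paths above store src[x] << 4, keeping both
+ ; paths on the same scale for the bidir/averaging kernels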
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ vpbroadcastw m5, mxyd
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+.h_w8_loop:
+ movu xmm0, [srcq+strideq*0]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+ vinserti32x4 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpermb m0, m4, m0
+ vpermb m1, m4, m1
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m4, [bilin_h_perm32]
+.h_w32_loop:
+ vpermb m0, m4, [srcq+strideq*0]
+ vpermb m1, m4, [srcq+strideq*1]
+ vpermb m2, m4, [srcq+strideq*2]
+ vpermb m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .h_w32_loop
+ RET
+.h_w64:
+ mova m4, [bilin_h_perm32]
+.h_w64_loop:
+ vpermb m0, m4, [srcq+strideq*0+32*0]
+ vpermb m1, m4, [srcq+strideq*0+32*1]
+ vpermb m2, m4, [srcq+strideq*1+32*0]
+ vpermb m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .h_w64_loop
+ RET
+.h_w128:
+ mova m4, [bilin_h_perm32]
+.h_w128_loop:
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .h_w128_loop
+ RET
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ add wq, t2
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, mxyd
+ jmp wq
+.v_w4:
+ vpbroadcastd xm0, [srcq+strideq*0]
+ mov r3d, 0x29
+ vbroadcasti32x4 ym3, [bilin_v_shuf4]
+ kmovb k1, r3d
+.v_w4_loop:
+ vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
+ vpbroadcastd ym2, [srcq+strideq*2]
+ vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ym0, [srcq+strideq*0]
+ punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_
+ pshufb ym2, ym3
+ pmaddubsw ym2, ym6
+ mova [tmpq], ym2
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ mova m5, [bilin_v_perm8]
+ vbroadcasti32x4 ym0, [srcq+strideq*0]
+.v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpbroadcastq ym0, [srcq+strideq*2]
+ vinserti32x4 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym0, [srcq+strideq*0], 0
+ vpermt2b m1, m5, m0
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mova m5, [bilin_v_perm16]
+ movu xm0, [srcq+strideq*0]
+.v_w16_loop:
+ movu xm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpermt2b m1, m5, m2
+ vinserti32x4 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ vpermt2b m2, m5, m0
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m5, [bilin_v_perm32]
+ movu ym0, [srcq+strideq*0]
+.v_w32_loop:
+ movu ym2, [srcq+strideq*1]
+ movu ym3, [srcq+strideq*2]
+ movu ym4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpermt2b m0, m5, m2
+ vpermt2b m2, m5, m3
+ vpermt2b m3, m5, m4
+ pmaddubsw m1, m0, m6
+ movu ym0, [srcq+strideq*0]
+ vpermt2b m4, m5, m0
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m4, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m4
+ add tmpq, 64*4
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0]
+.v_w64_loop:
+ vpermq m1, m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m1, m0
+ punpckhbw m2, m1, m0
+ vpermq m0, m5, [srcq+strideq*0]
+ punpcklbw m3, m0, m1
+ punpckhbw m1, m0, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m1
+ add tmpq, 64*4
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+.v_w128_loop:
+ vpermq m2, m5, [srcq+strideq*1+ 0]
+ vpermq m3, m5, [srcq+strideq*1+64]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m2, m0
+ punpckhbw m0, m2, m0
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m0
+ punpcklbw m4, m3, m1
+ punpckhbw m1, m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*2], m4
+ mova [tmpq+64*3], m1
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+ punpcklbw m4, m0, m2
+ punpckhbw m2, m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*4], m4
+ mova [tmpq+64*5], m2
+ punpcklbw m4, m1, m3
+ punpckhbw m3, m1, m3
+ pmaddubsw m4, m6
+ pmaddubsw m3, m6
+ mova [tmpq+64*6], m4
+ mova [tmpq+64*7], m3
+ add tmpq, 64*8
+ sub hd, 2
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
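+ ; scalar sketch of the vertical pass on the .h intermediates t[]:
+ ;   tmp[x] = t[x] + ((my * (t[x + stride] - t[x]) + 8) >> 4)
+ ; with my pre-shifted left by 11, pmulhrsw(t[x+stride] - t[x], my<<11)
+ ; computes exactly the rounded (my*d + 8) >> 4 term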
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastw m6, mxyd
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+ vpbroadcastq ym0, [srcq+strideq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+.hv_w4_loop:
+ movq xmm1, [srcq+strideq*1]
+ movq xmm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1
+ punpcklqdq ym1, ym2
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2 3 4
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym6
+ paddw ym1, ym2
+ mova [tmpq], ym1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+ vbroadcasti32x4 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xmm1, [srcq+strideq*1]
+ vinserti128 ym1, ymm1, [srcq+strideq*2], 1
+ vinserti128 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m1, [srcq+strideq*0], 3
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mova m4, [bilin_h_perm16]
+ vbroadcasti32x8 m0, [srcq+strideq*0]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu ym1, [srcq+strideq*1]
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ movu ym2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti32x8 m2, [srcq+strideq*0], 1
+ vpermb m1, m4, m1
+ vpermb m2, m4, m2
+ pmaddubsw m1, m5 ; 1 2
+ vshufi32x4 m3, m0, m1, q1032 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vshufi32x4 m2, m1, m0, q1032 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m3
+ add tmpq, 64*2
+ sub hd, 4
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+strideq*0]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m1, m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermb m2, m4, [srcq+strideq*0]
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+64*0], m3
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, strideq
+ vpermb m2, m4, [srcq+32*0]
+ vpermb m3, m4, [srcq+32*1]
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m7, m2, m0
+ psubw m8, m3, m1
+ pmulhrsw m7, m6
+ pmulhrsw m8, m6
+ paddw m7, m0
+ mova m0, m2
+ paddw m8, m1
+ mova m1, m3
+ mova [tmpq+64*0], m7
+ mova [tmpq+64*1], m8
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, strideq
+ vpermb m7, m4, [srcq+32*0]
+ vpermb m8, m4, [srcq+32*1]
+ vpermb m9, m4, [srcq+32*2]
+ vpermb m10, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m7, m8, m9, m10
+ psubw m11, m7, m0
+ psubw m12, m8, m1
+ psubw m13, m9, m2
+ psubw m14, m10, m3
+ REPX {pmulhrsw x, m6}, m11, m12, m13, m14
+ paddw m11, m0
+ mova m0, m7
+ paddw m12, m1
+ mova m1, m8
+ paddw m13, m2
+ mova m2, m9
+ paddw m14, m3
+ mova m3, m10
+ mova [tmpq+64*0], m11
+ mova [tmpq+64*1], m12
+ mova [tmpq+64*2], m13
+ mova [tmpq+64*3], m14
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
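+; each FILTER_* constant packs two row offsets into subpel_filters: bits
+; 16-31 select the 8-tap variant (type*15) and bits 0-15 the 4-tap variant
+; used for w <= 4; put/prep_8tap add mx*0x010101 (and my likewise) so the
+; wanted offset+mx is extracted with either a movzx of the low byte or a
+; shr by 16 (see the "8tap_h, mx, 4tap_h" comments below)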
+
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
+%if %5
+ vpermb m%2, m6, m%1
+ vpermb m%3, m7, m%1
+ vpermb m%4, m8, m%1
+%else
+%if %2 < %4 ; reuse a previous value if possible
+ pshufb m%2, m%1, m6
+%endif
+ pshufb m%3, m%1, m7
+ pshufb m%4, m%1, m8
+%endif
+ mova m%1, m5
+ vpdpbusd m%1, m%2, m9
+ mova m%2, m5
+ vpdpbusd m%2, m%3, m9
+ vpdpbusd m%1, m%3, m10
+ vpdpbusd m%2, m%4, m10
+ packusdw m%1, m%2
+ psrlw m%1, 6
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+%define base r8-put_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
+ WIN64_SPILL_XMM 11
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [base+mxq*8+subpel_filters+0]
+ vpbroadcastd m10, [base+mxq*8+subpel_filters+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xmm4, [subpel_h_shuf4]
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w2_loop:
+ movq xmm0, [srcq+ssq*0]
+ movhps xmm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4
+ mova xmm1, xm5
+ vpdpbusd xmm1, xmm0, xmm3
+ packssdw xmm0, xmm1, xmm1
+ psraw xmm0, 6
+ packuswb xmm0, xm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0]
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm6
+ pshufb xmm1, xm6
+ mova xmm2, xm5
+ vpdpbusd xmm2, xmm0, xmm3
+ mova xmm0, xm5
+ vpdpbusd xmm0, xmm1, xmm3
+ packssdw xmm0, xmm2, xmm0
+ psraw xmm0, 6
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mova m6, [spel_h_perm16a]
+ mova m7, [spel_h_perm16b]
+ mova m8, [spel_h_perm16c]
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3, 1
+ vpmovuswb ym0, m0
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu m0, [srcq+8*0]
+ movu m2, [srcq+8*1]
+ movu m1, [srcq+8*8]
+ movu m3, [srcq+8*9]
+ add srcq, ssq
+ PUT_8TAP_H 0, 4, 11, 12
+ PUT_8TAP_H 2, 12, 11, 4
+ PUT_8TAP_H 1, 4, 11, 12
+ PUT_8TAP_H 3, 12, 11, 4
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [base+subpel_filters+myq*8]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xmm2, [srcq+ssq*0]
+ pinsrw xmm2, [srcq+ssq*1], 2
+ pinsrw xmm2, [srcq+ssq*2], 4
+ add srcq, ss3q
+ pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm3, xmm1 ; 45 56
+ punpcklbw xmm1, xmm2, xmm4 ; 01 12
+ punpckhbw xmm2, xmm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xmm5, xmm1, xm8 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm9 ; a1 b1
+ paddw xmm5, xmm2
+ mova xmm2, xmm3
+ pmaddubsw xmm3, xm10 ; a2 b2
+ paddw xmm5, xmm3
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8
+ punpcklbw xmm3, xmm4 ; 67 78
+ pmaddubsw xmm4, xmm3, xm11 ; a3 b3
+ paddw xmm5, xmm4
+ pmulhrsw xmm5, xm7
+ packuswb xmm5, xmm5
+ pextrw [dstq+dsq*0], xmm5, 0
+ pextrw [dstq+dsq*1], xmm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+ add srcq, ss3q
+ pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm3, xmm1 ; 45 56
+ punpcklbw xmm1, xmm2, xmm4 ; 01 12
+ punpckhbw xmm2, xmm4 ; 23 34
+.v_w4_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xmm5, xmm1, xm8 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm9 ; a1 b1
+ paddw xmm5, xmm2
+ mova xmm2, xmm3
+ pmaddubsw xmm3, xm10 ; a2 b2
+ paddw xmm5, xmm3
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8
+ punpcklbw xmm3, xmm4 ; 67 78
+ pmaddubsw xmm4, xmm3, xm11 ; a3 b3
+ paddw xmm5, xmm4
+ pmulhrsw xmm5, xm7
+ packuswb xmm5, xmm5
+ movd [dstq+dsq*0], xmm5
+ pextrd [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq ymm5, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklbw ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm5, 0x30
+ vpblendd ymm5, ymm3, 0x30
+ punpcklbw ymm2, ymm5 ; 23 34
+ vpblendd ymm3, ymm4, 0x30
+ vpblendd ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 45 56
+.v_w8_loop:
+ vpbroadcastq ymm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw ymm5, ymm1, ym8 ; a0 b0
+ mova ymm1, ymm2
+ pmaddubsw ymm2, ym9 ; a1 b1
+ paddw ymm5, ymm2
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym10 ; a2 b2
+ paddw ymm5, ymm3
+ vpblendd ymm3, ymm0, ymm4, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm4, ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 67 78
+ pmaddubsw ymm4, ymm3, ym11 ; a3 b3
+ paddw ymm5, ymm4
+ pmulhrsw ymm5, ym7
+ vextracti128 xmm4, ymm5, 1
+ packuswb xmm5, xmm4
+ movq [dstq+dsq*0], xmm5
+ movhps [dstq+dsq*1], xmm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ vzeroupper
+ RET
+.v_w16:
+ mova m12, [spel_v_perm16]
+ vbroadcasti32x4 m1, [srcq+ssq*0]
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ mov r6d, 0x0f
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti32x4 ym5, [srcq+ssq*0]
+ kmovb k1, r6d
+ vbroadcasti32x4 m3, [srcq+ssq*1]
+ vbroadcasti32x4 ym6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m1{k1}, m4, m2, 0xcc
+ vshufpd m2{k1}, m5, m3, 0xcc
+ vshufpd m3{k1}, m6, m0, 0xcc
+ vpermb m1, m12, m1 ; 01 12
+ vpermb m2, m12, m2 ; 23 34
+ vpermb m3, m12, m3 ; 45 56
+.v_w16_loop:
+ pmaddubsw m4, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m5, m2, m9 ; a1 b1
+ mova m2, m3
+ pmaddubsw m6, m3, m10 ; a2 b2
+ mova m3, m0
+ paddw m4, m5
+ vbroadcasti32x4 ym5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m3{k1}, m5, m0, 0xcc
+ vpermb m3, m12, m3 ; 67 78
+ pmaddubsw m5, m3, m11 ; a3 b3
+ paddw m4, m6
+ paddw m4, m5
+ pmulhrsw m4, m7
+ vextracti32x8 ym5, m4, 1
+ packuswb ym4, ym5
+ mova [dstq+dsq*0], xm4
+ vextracti32x4 [dstq+dsq*1], ym4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m12, [spel_v_perm32]
+ pmovzxbq m14, [pb_02461357]
+ vpshrdw m13, m12, m12, 8
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ vpermb m1, m12, m0 ; 01
+ vinserti32x8 m0, [srcq+ssq*2], 0
+ add srcq, ss3q
+ vpermb m2, m13, m0 ; 12
+ vinserti32x8 m0, [srcq+ssq*0], 1
+ vpermb m3, m12, m0 ; 23
+ vinserti32x8 m0, [srcq+ssq*1], 0
+ vpermb m4, m13, m0 ; 34
+ vinserti32x8 m0, [srcq+ssq*2], 1
+ add srcq, ss3q
+ vpermb m5, m12, m0 ; 45
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ vpermb m6, m13, m0 ; 56
+.v_w32_loop:
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m15, m1, m8
+ mova m1, m3
+ pmaddubsw m16, m2, m8
+ mova m2, m4
+ pmaddubsw m17, m3, m9
+ mova m3, m5
+ pmaddubsw m18, m4, m9
+ mova m4, m6
+ pmaddubsw m19, m5, m10
+ vpermb m5, m12, m0 ; 67
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ pmaddubsw m20, m6, m10
+ vpermb m6, m13, m0 ; 78
+ paddw m15, m17
+ pmaddubsw m17, m5, m11
+ paddw m16, m18
+ pmaddubsw m18, m6, m11
+ paddw m15, m19
+ paddw m16, m20
+ paddw m15, m17
+ paddw m16, m18
+ pmulhrsw m15, m7
+ pmulhrsw m16, m7
+ packuswb m15, m16
+ vpermq m15, m14, m15
+ mova [dstq+dsq*0], ym15
+ vextracti32x8 [dstq+dsq*1], m15, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+.v_w128:
+ lea r6d, [hq+wq*4-256]
+ mov r4, srcq
+ mov r7, dstq
+.v_loop0:
+ movu m2, [srcq+ssq*0]
+ movu m4, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ movu m13, [srcq+ssq*0]
+ movu m15, [srcq+ssq*1]
+ movu m17, [srcq+ssq*2]
+ add srcq, ss3q
+ movu m0, [srcq+ssq*0]
+ punpcklbw m1, m2, m4 ; 01l
+ punpckhbw m2, m4 ; 01h
+ punpcklbw m3, m4, m6 ; 12l
+ punpckhbw m4, m6 ; 12h
+ punpcklbw m5, m6, m13 ; 23l
+ punpckhbw m6, m13 ; 23h
+ punpcklbw m12, m13, m15 ; 34l
+ punpckhbw m13, m15 ; 34h
+ punpcklbw m14, m15, m17 ; 45l
+ punpckhbw m15, m17 ; 45h
+ punpcklbw m16, m17, m0 ; 56l
+ punpckhbw m17, m0 ; 56h
+.v_loop:
+ pmaddubsw m18, m1, m8 ; a0l
+ mova m1, m5
+ pmaddubsw m19, m2, m8 ; a0h
+ mova m2, m6
+ pmaddubsw m20, m3, m8 ; b0l
+ mova m3, m12
+ pmaddubsw m21, m4, m8 ; b0h
+ mova m4, m13
+ pmaddubsw m5, m9 ; a1l
+ pmaddubsw m6, m9 ; a1h
+ pmaddubsw m12, m9 ; b1l
+ pmaddubsw m13, m9 ; b1h
+ paddw m18, m5
+ mova m5, m14
+ pmaddubsw m14, m10 ; a2l
+ paddw m19, m6
+ mova m6, m15
+ pmaddubsw m15, m10 ; a2h
+ paddw m20, m12
+ mova m12, m16
+ pmaddubsw m16, m10 ; b2l
+ paddw m21, m13
+ mova m13, m17
+ pmaddubsw m17, m10 ; b2h
+ paddw m18, m14
+ paddw m19, m15
+ paddw m20, m16
+ paddw m21, m17
+ movu m17, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m14, m0, m17 ; 67l
+ punpckhbw m15, m0, m17 ; 67h
+ pmaddubsw m16, m14, m11 ; a3l
+ pmaddubsw m0, m15, m11 ; a3h
+ paddw m18, m16
+ paddw m19, m0
+ movu m0, [srcq+ssq*0]
+ punpcklbw m16, m17, m0 ; 78l
+ punpckhbw m17, m0 ; 78h
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ packuswb m18, m19
+ mova [dstq+dsq*0], m18
+ pmaddubsw m18, m16, m11 ; b3l
+ pmaddubsw m19, m17, m11 ; b3h
+ paddw m18, m20
+ paddw m19, m21
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ packuswb m18, m19
+ mova [dstq+dsq*1], m18
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_loop
+ add r4, 64
+ add r7, 64
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 256
+ jg .v_loop0
+ vzeroupper
+ RET
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastq ym0, [base+subpel_filters+myq*8]
+ lea ss3q, [ssq*3]
+ vpbroadcastd ym9, [pd_32768]
+ mov r6, srcq
+ punpcklbw ym0, ym8, ym0
+ sub r6, ss3q
+ psraw ym0, 2 ; << 6
+ mova xm14, [spel_hv_end]
+ pshufd ym10, ym0, q0000
+ pshufd ym11, ym0, q1111
+ pshufd ym12, ym0, q2222
+ pshufd ym13, ym0, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 ym6, [subpel_h_shuf4]
+ movq xmm2, [r6+ssq*0]
+ movhps xmm2, [r6+ssq*1]
+ movq xmm0, [r6+ssq*2]
+ movhps xmm0, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq ymm1, [srcq+ssq*0]
+ vpblendd ymm2, ymm3, 0x30
+ vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _
+ vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5
+ pshufb ymm2, ym6
+ pshufb ymm0, ym6
+ mova ymm1, ym8
+ vpdpbusd ymm1, ymm2, ym7
+ mova ymm2, ym8
+ vpdpbusd ymm2, ymm0, ym7
+ packssdw ymm2, ymm1, ymm2
+ psraw ymm2, 2
+ vextracti128 xmm3, ymm2, 1
+ palignr xmm4, xmm3, xmm2, 4
+ punpcklwd xmm1, xmm2, xmm4 ; 01 12
+ punpckhwd xmm2, xmm4 ; 23 34
+ pshufd xmm0, xmm3, q2121
+ punpcklwd xmm3, xmm0 ; 45 56
+.hv_w2_loop:
+ movq xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm4, [srcq+ssq*0]
+ mova xmm5, xm9
+ vpdpwssd xmm5, xmm1, xm10 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm5, xmm2, xm11 ; a1 b1
+ pshufb xmm4, xm6
+ mova xmm2, xmm3
+ vpdpwssd xmm5, xmm3, xm12 ; a2 b2
+ mova xmm3, xm8
+ vpdpbusd xmm3, xmm4, xm7
+ packssdw xmm4, xmm3, xmm3
+ psraw xmm4, 2
+ palignr xmm3, xmm4, xmm0, 12
+ mova xmm0, xmm4
+ punpcklwd xmm3, xmm4 ; 67 78
+ vpdpwssd xmm5, xmm3, xm13 ; a3 b3
+ packuswb xmm5, xmm5
+ pshufb xmm5, xm14
+ pextrw [dstq+dsq*0], xmm5, 0
+ pextrw [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ vzeroupper
+ RET
+.hv_w4:
+ movq xmm1, [r6+ssq*0]
+ vpbroadcastq ym2, [r6+ssq*1]
+ vinserti32x4 ym1, ymm1, [r6+ssq*2], 1
+ vinserti32x4 m2, [srcq+ssq*0], 2
+ vinserti32x4 m1, [srcq+ssq*1], 2
+ vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5
+ vbroadcasti32x4 m6, [subpel_h_shufA]
+ add srcq, ss3q
+ vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6
+ pshufb m2, m6
+ pshufb m1, m6
+ mova m0, m8
+ vpdpbusd m0, m2, m7
+ mova m4, m8
+ vpdpbusd m4, m1, m7
+ mova ym1, [spel_hv_perm4a]
+ mova ym2, [spel_hv_perm4b]
+ mova ym3, [spel_hv_perm4c]
+ packssdw m0, m4
+ psraw m0, 2 ; _ 0 1 2 3 4 5 6
+ mov r6d, 0x5555
+ vpermb ym1, ym1, ym0 ; 01 12
+ vpermb m2, m2, m0 ; 23 34
+ vpermb m3, m3, m0 ; 45 56
+ kmovw k1, r6d
+ mova ym15, [spel_hv_perm4d]
+.hv_w4_loop:
+ movq xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1
+ mova ym5, ym9
+ vpdpwssd ym5, ym1, ym10 ; a0 b0
+ mova ym1, ym2
+ pshufb ym4, ym6
+ mova ym0, ym8
+ vpdpbusd ym0, ym4, ym7
+ vpdpwssd ym5, ym2, ym11 ; a1 b1
+ mova ym2, ym3
+ vpdpwssd ym5, ym3, ym12 ; a2 b2
+ vpsraw ym3{k1}, ym0, 2 ; 7 8
+ vpermb ym3, ym15, ym3 ; 67 78
+ vpdpwssd ym5, ym3, ym13 ; a3 b3
+ packuswb ym5, ym5
+ vpermb ym5, ym14, ym5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ vpbroadcastd m9, [pd_32768]
+ punpcklbw m0, m8, m0
+ lea ss3q, [ssq*3]
+ psraw m0, 2 ; << 6
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ cmp wd, 8
+ jne .hv_w16
+ mov r6, srcq
+ sub r6, ss3q
+ movu xmm1, [r6+ssq*0]
+ vinserti128 ymm1, [r6+ssq*1], 1
+ movu xmm2, [srcq+ssq*1]
+ vinserti32x4 m6, zmm1, [r6+ssq*2], 2
+ vinserti128 ymm2, [srcq+ssq*2], 1
+ vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3
+ add srcq, ss3q
+ vbroadcasti32x4 m4, [subpel_h_shufA]
+ vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _
+ vbroadcasti32x4 m7, [subpel_h_shufB]
+ vbroadcasti32x4 m17, [subpel_h_shufC]
+ pshufb m1, m6, m4 ; 0 1 2 3 0123
+ mova m2, m8
+ vpdpbusd m2, m1, m10
+ pshufb m5, m6, m7 ; 0 1 2 3 4567
+ mova m1, m8
+ vpdpbusd m1, m5, m10
+ pshufb m4, m0, m4 ; 4 5 6 _ 0123
+ mova m3, m8
+ vpdpbusd m3, m4, m10
+ pshufb m7, m0, m7 ; 4 5 6 _ 4567
+ mova m4, m8
+ vpdpbusd m4, m7, m10
+ pshufb m6, m17
+ vpdpbusd m2, m5, m11
+ vpdpbusd m1, m6, m11
+ pshufb m6, m0, m17
+ vpdpbusd m3, m7, m11
+ vpdpbusd m4, m6, m11
+ mova m5, [spel_hv_perm8a]
+ mova m0, [spel_hv_perm8b]
+ mov r6, 0x55555555ff00
+ packssdw m2, m1
+ packssdw m3, m4
+ mova m18, [spel_hv_perm8c]
+ psraw m2, 2 ; 0 1 2 3
+ psraw m3, 2 ; 4 5 6 _
+ vpermb m1, m5, m2 ; 01 12
+ vbroadcasti32x8 m6, [subpel_h_shufA]
+ kmovq k1, r6
+ vpermt2b m2, m0, m3 ; 23 34
+ vbroadcasti32x8 m7, [subpel_h_shufB]
+ kshiftrq k2, k1, 16
+ mova xm16, [spel_hv_end]
+ vpermb m3, m5, m3 ; 45 56
+.hv_w8_loop:
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m4{k1}, [srcq+ssq*0]
+ mova m0, m9
+ vpdpwssd m0, m1, m12 ; a0 b0
+ pshufb m1, m4, m6 ; 7 8 0123 4567
+ mova m5, m8
+ vpdpbusd m5, m1, m10
+ pshufb m4, m7 ; 7 8 4567 89ab
+ vpdpwssd m0, m2, m13 ; a1 b1
+ mova m1, m2
+ vpdpbusd m5, m4, m11
+ mova m2, m3
+ vpdpwssd m0, m3, m14 ; a2 b2
+ psraw m3{k2}, m5, 2 ; 75 86
+ vpermb m3, m18, m3 ; 67 78
+ vpdpwssd m0, m3, m15 ; a3 b3
+ packuswb m0, m0
+ vpermb zmm1, m16, m0
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+.hv_w16:
+ movu m7, [spel_hv_perm16a]
+ sub srcq, ss3q
+ mova m20, [spel_hv_perm16b]
+ lea r6d, [wq*2-32]
+ mova m21, [spel_hv_perm16c]
+ mov r4, srcq
+ mov r7, dstq
+ mova ym16, [spel_hv_end16]
+ lea r6d, [hq+r6*8]
+.hv_w16_loop0:
+ movu ym17, [srcq+ssq*0]
+ vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1
+ movu ym18, [srcq+ssq*2]
+ add srcq, ss3q
+ vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3
+ movu ym19, [srcq+ssq*1]
+ vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5
+ add srcq, ss3q
+ vpermb m2, m7, m17 ; 0 1 0123 89ab
+ vpermb m0, m20, m17 ; 0 1 4567 cdef
+ vpermb m4, m7, m18 ; 2 3 0123 89ab
+ mova m1, m8
+ vpdpbusd m1, m2, m10
+ vpermb m5, m20, m18 ; 2 3 4567 cdef
+ mova m2, m8
+ vpdpbusd m2, m0, m10
+ vpermb m17, m21, m17 ; 0 1 89ab ghij
+ mova m3, m8
+ vpdpbusd m3, m4, m10
+ vpermb m6, m7, m19 ; 4 5 0123 89ab
+ mova m4, m8
+ vpdpbusd m4, m5, m10
+ vpermb m18, m21, m18 ; 2 3 89ab ghij
+ vpdpbusd m1, m0, m11
+ movu ym0, [srcq+ssq*0] ; 6
+ vpdpbusd m2, m17, m11
+ vpermb m17, m20, m19 ; 4 5 4567 cdef
+ vpdpbusd m3, m5, m11
+ mova m5, m8
+ vpdpbusd m5, m6, m10
+ mova m6, m8
+ vpdpbusd m6, m17, m10
+ vpdpbusd m4, m18, m11
+ mova m18, [spel_hv_perm16d]
+ vpermb m18, m18, m0 ; 6 0145 2367 89cd abef
+ vpdpbusd m5, m17, m11
+ vpermb m19, m21, m19 ; 4 5 89ab ghij
+ mova m17, m8
+ vpdpbusd m17, m18, m10
+ mova m18, [spel_hv_perm16e]
+ vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij
+ packssdw m1, m2 ; 01
+ vpdpbusd m6, m19, m11
+ packssdw m3, m4 ; 23
+ vpdpbusd m17, m0, m11
+ psraw m1, 2
+ packssdw m5, m6 ; 45
+ psraw m3, 2
+ vpshrdd m2, m1, m3, 16 ; 12
+ psraw m5, 2
+ vpshrdd m4, m3, m5, 16 ; 34
+ psraw m17, 2
+ vpshrdd m6, m5, m17, 16 ; 56
+.hv_w16_loop:
+ movu ym18, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m18, [srcq+ssq*0], 1
+ mova m0, m9
+ vpdpwssd m0, m1, m12 ; a0
+ vpermb m1, m7, m18 ; 7 8 0123 89ab
+ mova m17, m9
+ vpdpwssd m17, m2, m12 ; b0
+ vpermb m2, m20, m18 ; 7 8 4567 cdef
+ mova m19, m8
+ vpdpbusd m19, m1, m10
+ vpermb m18, m21, m18
+ mova m1, m8
+ vpdpbusd m1, m2, m10
+ vpdpwssd m0, m3, m13 ; a1
+ vpdpwssd m17, m4, m13 ; b1
+ vpdpbusd m19, m2, m11
+ mova m2, m4
+ vpdpbusd m1, m18, m11
+ mova m4, m6
+ vpdpwssd m0, m5, m14 ; a2
+ vpdpwssd m17, m6, m14 ; b2
+ packssdw m19, m1
+ mova m1, m3
+ mova m3, m5
+ psraw m6, m19, 2 ; 7 8
+ vpshrdd m5, m4, m6, 16 ; 6 7
+ vpdpwssd m17, m6, m15 ; b3
+ vpdpwssd m0, m5, m15 ; a3
+ packuswb m0, m17
+ vpermb zmm1, m16, m0
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ vzeroupper
+ RET
+
+%macro PREP_8TAP_H 0
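+ ; Same dot-product layout as the put path, but prep keeps 16-bit
+ ; intermediates: m4 = pd_2 and psraw 2 give (sum + 2) >> 2, preserving the
+ ; extra precision consumed later by the bidir kernels.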
+ vpermb m10, m5, m0
+ vpermb m11, m5, m1
+ vpermb m12, m6, m0
+ vpermb m13, m6, m1
+ vpermb m14, m7, m0
+ vpermb m15, m7, m1
+ mova m0, m4
+ vpdpbusd m0, m10, m8
+ mova m2, m4
+ vpdpbusd m2, m12, m8
+ mova m1, m4
+ vpdpbusd m1, m11, m8
+ mova m3, m4
+ vpdpbusd m3, m13, m8
+ vpdpbusd m0, m12, m9
+ vpdpbusd m2, m14, m9
+ vpdpbusd m1, m13, m9
+ vpdpbusd m3, m15, m9
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pd_2]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ vbroadcasti128 ym5, [subpel_h_shufA]
+ mov r3d, 0x4
+ dec srcq
+ vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ kmovb k1, r3d
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm2, [srcq+strideq*0]
+ movq xm3, [srcq+strideq*1]
+ vpbroadcastq ym2{k1}, [srcq+strideq*2]
+ vpbroadcastq ym3{k1}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pshufb ym2, ym5
+ pshufb ym3, ym5
+ mova ym0, ym4
+ vpdpbusd ym0, ym2, ym6
+ mova ym1, ym4
+ vpdpbusd ym1, ym3, ym6
+ packssdw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti128 m5, [subpel_h_shufA]
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu xmm3, [srcq+strideq*0]
+ vinserti128 ym3, ymm3, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*2], 2
+ vinserti128 m3, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ mova m0, m4
+ vpdpbusd m0, m1, m8
+ mova m1, m4
+ vpdpbusd m1, m2, m8
+ pshufb m3, m7
+ vpdpbusd m0, m2, m9
+ vpdpbusd m1, m3, m9
+ packssdw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m5, [spel_h_perm16a]
+ mova m6, [spel_h_perm16b]
+ mova m7, [spel_h_perm16c]
+ lea stride3q, [strideq*3]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+.h_w32_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ xor r6d, r6d
+ jmp .h_start
+.h_w128:
+ mov r6, -64*1
+.h_start:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu m0, [srcq+r6+32*0]
+ movu m1, [srcq+r6+32*1]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ add r6, 64
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only, having
+ tzcnt wd, wd
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
+ lea myq, [r7+myq*8+subpel_filters-prep_avx512icl]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
+ add wq, r7
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ jmp wq
+.v_w4:
+ movd xmm0, [srcq+strideq*0]
+ vpbroadcastd ymm1, [srcq+strideq*2]
+ vpbroadcastd xmm2, [srcq+strideq*1]
+ vpbroadcastd ymm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd ymm0, [srcq+strideq*0]
+ vpbroadcastd ymm2, [srcq+strideq*1]
+ vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vbroadcasti128 ymm5, [deint_shuf4]
+ vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34
+ vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw ymm2, ymm3 ; 23 34 45 56
+.v_w4_loop:
+ pinsrd xmm0, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ymm3, [srcq+strideq*0]
+ vpbroadcastd ymm4, [srcq+strideq*1]
+ vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _
+ vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb ymm3, ymm5 ; 67 78 89 9a
+ pmaddubsw ymm4, ymm1, ym8
+ vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78
+ pmaddubsw ymm2, ym9
+ paddw ymm4, ymm2
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym11
+ paddw ymm3, ymm4
+ pmaddubsw ymm4, ymm1, ym10
+ paddw ymm3, ymm4
+ pmulhrsw ymm3, ym7
+ mova [tmpq], ymm3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ mov r3d, 0xf044
+ kmovw k1, r3d
+ kshiftrw k2, k1, 8
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq ym1, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpbroadcastq m6, [srcq+strideq*2]
+ vmovdqa64 ym0{k1}, ym1
+ vmovdqa64 ym1{k1}, ym2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ punpcklbw ym0, ym1 ; 01 12 __ __
+ punpcklbw m2, m3 ; 23 34 23 34
+ punpcklbw m4, m5 ; 45 56 45 56
+ vmovdqa64 m0{k2}, m2 ; 01 12 23 34
+ vmovdqa64 m2{k2}, m4 ; 23 34 45 56
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m15, m2, m9
+ vpblendmq m0{k1}, m6, m1
+ vpblendmq m2{k1}, m1, m3
+ vpbroadcastq m6, [srcq+strideq*2]
+ paddw m14, m15
+ punpcklbw m2, m0, m2 ; 67 78 67 78
+ vpblendmq m12{k1}, m3, m5
+ vpblendmq m13{k1}, m5, m6
+ vpblendmq m0{k2}, m4, m2 ; 45 56 67 78
+ punpcklbw m4, m12, m13 ; 89 9a 89 9a
+ vmovdqa64 m2{k2}, m4 ; 67 78 89 9a
+ pmaddubsw m12, m0, m10
+ pmaddubsw m13, m2, m11
+ paddw m14, m12
+ paddw m14, m13
+ pmulhrsw m14, m7
+ mova [tmpq], m14
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mov r3d, 0xf0
+ kmovb k1, r3d
+ vbroadcasti128 m0, [srcq+strideq*0]
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ vmovdqa64 m0{k1}, m1
+ vmovdqa64 m1{k1}, m2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
+ shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
+ shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
+ shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
+ punpckhbw m2, m0, m1 ; 23a 23b 34a 34b
+ punpcklbw m0, m1 ; 01a 01b 12a 12b
+ punpcklbw m4, m5 ; 45a 45b 56a 56b
+.v_w16_loop:
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m5, [srcq+strideq*0]
+ vpblendmq m1{k1}, m6, m3
+ vmovdqa64 m3{k1}, m5
+ pmaddubsw m12, m0, m8
+ pmaddubsw m13, m2, m8
+ pmaddubsw m14, m2, m9
+ pmaddubsw m15, m4, m9
+ pmaddubsw m0, m4, m10
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ paddw m12, m14
+ paddw m13, m15
+ paddw m12, m0
+ vmovdqa64 m5{k1}, m2
+ vmovdqa64 m2{k1}, m6
+ mova m0, m4
+ shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
+ shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
+ punpcklbw m2, m1, m3 ; 67a 67b 78a 78b
+ punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab
+ pmaddubsw m14, m2, m10
+ pmaddubsw m15, m2, m11
+ paddw m13, m14
+ paddw m12, m15
+ pmaddubsw m14, m4, m11
+ paddw m13, m14
+ pmulhrsw m12, m7
+ pmulhrsw m13, m7
+ mova [tmpq+ 0], m12
+ mova [tmpq+64], m13
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m18, [bilin_v_perm64]
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ movu ym3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym4, [srcq+strideq*0]
+ movu ym5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym6, [srcq+strideq*0]
+ vpermq m0, m18, m0
+ vpermq m1, m18, m1
+ vpermq m2, m18, m2
+ vpermq m3, m18, m3
+ vpermq m4, m18, m4
+ vpermq m5, m18, m5
+ vpermq m6, m18, m6
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+ punpcklbw m3, m4
+ punpcklbw m4, m5
+ punpcklbw m5, m6
+.v_w32_loop:
+ movu ym12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym13, [srcq+strideq*0]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m16, m2, m9
+ pmaddubsw m15, m1, m8
+ pmaddubsw m17, m3, m9
+ mova m0, m2
+ mova m1, m3
+ vpermq m12, m18, m12
+ vpermq m13, m18, m13
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m4, m10
+ pmaddubsw m17, m5, m10
+ punpcklbw m6, m12
+ punpcklbw m12, m13
+ mova m2, m4
+ mova m3, m5
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m6, m11
+ pmaddubsw m17, m12, m11
+ mova m4, m6
+ mova m5, m12
+ paddw m14, m16
+ paddw m15, m17
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova m6, m13
+ mova [tmpq+ 0], m14
+ mova [tmpq+64], m15
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+ mov wd, 64
+ jmp .v_start
+.v_w128:
+ mov wd, 128
+.v_start:
+ WIN64_SPILL_XMM 27
+ mova m26, [bilin_v_perm64]
+ lea r6d, [hq+wq*2]
+ mov r5, srcq
+ mov r7, tmpq
+.v_loop0:
+ vpermq m0, m26, [srcq+strideq*0]
+ vpermq m1, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m2, m26, [srcq+strideq*0]
+ vpermq m3, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m4, m26, [srcq+strideq*0]
+ vpermq m5, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m6, m26, [srcq+strideq*0]
+ punpckhbw m12, m0, m1
+ punpcklbw m0, m1
+ punpckhbw m13, m1, m2
+ punpcklbw m1, m2
+ punpckhbw m14, m2, m3
+ punpcklbw m2, m3
+ punpckhbw m15, m3, m4
+ punpcklbw m3, m4
+ punpckhbw m16, m4, m5
+ punpcklbw m4, m5
+ punpckhbw m17, m5, m6
+ punpcklbw m5, m6
+.v_loop:
+ vpermq m18, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m19, m26, [srcq+strideq*0]
+ pmaddubsw m20, m0, m8
+ pmaddubsw m21, m12, m8
+ pmaddubsw m22, m1, m8
+ pmaddubsw m23, m13, m8
+ mova m0, m2
+ mova m12, m14
+ mova m1, m3
+ mova m13, m15
+ pmaddubsw m2, m9
+ pmaddubsw m14, m9
+ pmaddubsw m3, m9
+ pmaddubsw m15, m9
+ punpckhbw m24, m6, m18
+ punpcklbw m6, m18
+ paddw m20, m2
+ paddw m21, m14
+ paddw m22, m3
+ paddw m23, m15
+ mova m2, m4
+ mova m14, m16
+ mova m3, m5
+ mova m15, m17
+ pmaddubsw m4, m10
+ pmaddubsw m16, m10
+ pmaddubsw m5, m10
+ pmaddubsw m17, m10
+ punpckhbw m25, m18, m19
+ punpcklbw m18, m19
+ paddw m20, m4
+ paddw m21, m16
+ paddw m22, m5
+ paddw m23, m17
+ mova m4, m6
+ mova m16, m24
+ mova m5, m18
+ mova m17, m25
+ pmaddubsw m6, m11
+ pmaddubsw m24, m11
+ pmaddubsw m18, m11
+ pmaddubsw m25, m11
+ paddw m20, m6
+ paddw m21, m24
+ paddw m22, m18
+ paddw m23, m25
+ pmulhrsw m20, m7
+ pmulhrsw m21, m7
+ pmulhrsw m22, m7
+ pmulhrsw m23, m7
+ mova m6, m19
+ mova [tmpq+wq*0+ 0], m20
+ mova [tmpq+wq*0+64], m21
+ mova [tmpq+wq*2+ 0], m22
+ mova [tmpq+wq*2+64], m23
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_loop
+ add r5, 64
+ add r7, 128
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .v_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ tzcnt wd, wd
+ vpbroadcastd m8, [pd_2]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
+ vpbroadcastd m9, [pd_32]
+ add wq, r7
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp wq
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mov r3d, 0x04
+ kmovb k1, r3d
+ kshiftlb k2, k1, 2
+ kshiftlb k3, k1, 4
+ vpbroadcastd m10, [pd_2]
+ vbroadcasti128 m16, [subpel_h_shufA]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ movq xm3, [srcq+strideq*0]
+ vpbroadcastq ym2, [srcq+strideq*1]
+ vpbroadcastq ym3{k1}, [srcq+strideq*2]
+ vpbroadcastq m2{k2}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3{k2}, [srcq+strideq*0]
+ vpbroadcastq m2{k3}, [srcq+strideq*1]
+ vpbroadcastq m3{k3}, [srcq+strideq*2]
+ mova m17, [spel_hv_perm4a]
+ movu m18, [spel_hv_perm4b]
+ mova m0, m10
+ mova m1, m10
+ pshufb m2, m16
+ pshufb m3, m16
+ vpdpbusd m0, m2, m8
+ vpdpbusd m1, m3, m8
+ packssdw m0, m1 ; _ 0 1 2 3 4 5 6
+ psraw m0, 2
+ vpermb m1, m17, m0 ; 01 12 23 34
+ vpermb m2, m18, m0 ; 23 34 45 56
+.hv_w4_loop:
+ movq xm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movq xm4, [srcq+strideq*0]
+ vpbroadcastq ym3{k1}, [srcq+strideq*1]
+ vpbroadcastq ym4{k1}, [srcq+strideq*2]
+ mova ym5, ym10
+ mova ym6, ym10
+ pshufb ym3, ym16
+ pshufb ym4, ym16
+ vpdpbusd ym5, ym3, ym8
+ vpdpbusd ym6, ym4, ym8
+ mova m7, m11
+ packssdw ym5, ym6 ; 7 8 9 a _ _ _ _
+ psraw ym5, 2
+ valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a
+ vpdpwssd m7, m1, m12
+ vpdpwssd m7, m2, m13
+ vpermb m1, m17, m0 ; 45 56 67 78
+ vpermb m2, m18, m0 ; 67 78 89 9a
+ vpdpwssd m7, m1, m14
+ vpdpwssd m7, m2, m15
+ psrad m7, 6
+ vpmovdw [tmpq], m7
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ WIN64_SPILL_XMM 24
+ vbroadcasti128 m16, [subpel_h_shufA]
+ vbroadcasti128 m17, [subpel_h_shufB]
+ vbroadcasti128 m18, [subpel_h_shufC]
+ vinserti128 ym0, [srcq+strideq*0], 1
+ vinserti128 m0, [srcq+strideq*1], 2
+ vinserti128 m0, [srcq+strideq*2], 3
+ movu xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym1, [srcq+strideq*0], 1
+ vinserti128 m1, [srcq+strideq*1], 2
+ vinserti128 m1, [srcq+strideq*2], 3
+ mova m2, m8
+ mova m4, m8
+ mova m3, m8
+ mova m5, m8
+ pshufb m20, m0, m16
+ pshufb m21, m0, m17
+ pshufb m22, m0, m18
+ pshufb m23, m1, m16
+ pshufb m6, m1, m17
+ pshufb m7, m1, m18
+ vpdpbusd m2, m20, m10
+ vpdpbusd m4, m21, m10
+ vpdpbusd m2, m21, m11
+ vpdpbusd m4, m22, m11
+ vpdpbusd m3, m23, m10
+ vpdpbusd m5, m6, m10
+ vpdpbusd m3, m6, m11
+ vpdpbusd m5, m7, m11
+ packssdw m2, m4
+ packssdw m3, m5
+ psraw m2, 2 ; _ 0 1 2
+ psraw m3, 2 ; 3 4 5 6
+ valignq m0, m3, m2, 2 ; 0 1 2 3
+ valignq m1, m3, m2, 4 ; 1 2 3 4
+ valignq m2, m3, m2, 6 ; 2 3 4 5
+ punpcklwd m4, m0, m1 ; 01a 12a 23a 34a
+ punpckhwd m5, m0, m1 ; 01b 12b 23b 34b
+ punpcklwd m6, m2, m3 ; 23a 34a 45a 56a
+ punpckhwd m7, m2, m3 ; 23b 34b 45b 56b
+.hv_w8_loop:
+ movu xm19, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym19, [srcq+strideq*0], 1
+ vinserti128 m19, [srcq+strideq*1], 2
+ vinserti128 m19, [srcq+strideq*2], 3
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m4, m12
+ vpdpwssd m21, m5, m12
+ vpdpwssd m20, m6, m13
+ vpdpwssd m21, m7, m13
+ pshufb m0, m19, m16
+ pshufb m1, m19, m17
+ pshufb m2, m19, m18
+ vpdpbusd m22, m0, m10
+ vpdpbusd m23, m1, m10
+ vpdpbusd m22, m1, m11
+ vpdpbusd m23, m2, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7 8 9 A
+ valignq m0, m22, m3, 2 ; 4 5 6 7
+ valignq m1, m22, m3, 4 ; 5 6 7 8
+ valignq m2, m22, m3, 6 ; 6 7 8 9
+ mova m3, m22
+ punpcklwd m4, m0, m1 ; 45a 56a 67a 78a
+ punpckhwd m5, m0, m1 ; 45b 56b 67b 78b
+ punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa
+ punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq], m20
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mov wd, 16*2
+ jmp .hv_start
+.hv_w32:
+ mov wd, 32*2
+ jmp .hv_start
+.hv_w64:
+ mov wd, 64*2
+ jmp .hv_start
+.hv_w128:
+ mov wd, 128*2
+.hv_start:
+ WIN64_SPILL_XMM 31
+ mova m16, [spel_h_perm16a]
+ mova m17, [spel_h_perm16b]
+ mova m18, [spel_h_perm16c]
+ lea r6d, [hq+wq*8-256]
+ mov r5, srcq
+ mov r7, tmpq
+.hv_loop0:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym1, [srcq+strideq*0]
+ vinserti32x8 m1, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ vinserti32x8 m2, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym3, [srcq+strideq*0]
+ mova m4, m8
+ mova m5, m8
+ mova m6, m8
+ mova m7, m8
+ vpermb m19, m16, m0
+ vpermb m20, m17, m0
+ vpermb m21, m18, m0
+ vpermb m22, m16, m1
+ vpermb m23, m17, m1
+ vpermb m24, m18, m1
+ vpermb m25, m16, m2
+ vpermb m26, m17, m2
+ vpermb m27, m18, m2
+ vpermb ym28, ym16, ym3
+ vpermb ym29, ym17, ym3
+ vpermb ym30, ym18, ym3
+ mova m0, m8
+ mova m1, m8
+ mova ym2, ym8
+ mova ym3, ym8
+ vpdpbusd m4, m19, m10
+ vpdpbusd m5, m20, m10
+ vpdpbusd m6, m22, m10
+ vpdpbusd m7, m23, m10
+ vpdpbusd m0, m25, m10
+ vpdpbusd m1, m26, m10
+ vpdpbusd ym2, ym28, ym10
+ vpdpbusd ym3, ym29, ym10
+ vpdpbusd m4, m20, m11
+ vpdpbusd m5, m21, m11
+ vpdpbusd m6, m23, m11
+ vpdpbusd m7, m24, m11
+ vpdpbusd m0, m26, m11
+ vpdpbusd m1, m27, m11
+ vpdpbusd ym2, ym29, ym11
+ vpdpbusd ym3, ym30, ym11
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m0, m1
+ packssdw ym2, ym3
+ psraw m4, 2 ; 0a 0b 1a 1b
+ psraw m6, 2 ; 2a 2b 3a 3b
+ psraw m0, 2 ; 4a 4b 5a 5b
+ psraw ym2, 2 ; 6a 6b __ __
+ vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b
+ vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b
+ vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b
+ punpcklwd m2, m4, m5 ; 01a 01c 12a 12c
+ punpckhwd m3, m4, m5 ; 01b 01d 12b 12d
+ punpcklwd m4, m6, m7 ; 23a 23c 34a 34c
+ punpckhwd m5, m6, m7 ; 23b 23d 34b 34d
+ punpcklwd m6, m0, m1 ; 45a 45c 56a 56c
+ punpckhwd m7, m0, m1 ; 45b 45d 56b 56d
+.hv_loop:
+ movu ym19, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m19, [srcq+strideq*0], 1
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m2, m12
+ vpdpwssd m21, m3, m12
+ vpdpwssd m20, m4, m13
+ vpdpwssd m21, m5, m13
+ vpermb m24, m16, m19
+ vpermb m25, m17, m19
+ vpermb m26, m18, m19
+ vpdpbusd m22, m24, m10
+ vpdpbusd m23, m25, m10
+ vpdpbusd m22, m25, m11
+ vpdpbusd m23, m26, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7a 7b 8a 8b
+ vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b
+ mova m2, m4
+ mova m3, m5
+ mova m1, m22
+ mova m4, m6
+ mova m5, m7
+ punpcklwd m6, m0, m1 ; 67a 67c 78a 78c
+ punpckhwd m7, m0, m1 ; 67b 67d 78b 78d
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq+wq*0], ym20
+ vextracti32x8 [tmpq+wq*1], m20, 1
+ lea tmpq, [tmpq+wq*2]
+ sub hd, 2
+ jg .hv_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .hv_loop0
+ RET
+
+cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
+ vpbroadcastd m9, [pd_16384]
+ mova ym15, [warp_8x8t_end]
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main
+ jmp .start
+.loop:
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2
+ lea tmpq, [tmpq+tsq*4]
+.start:
+ paddd m16, m16
+ vpermb m16, m15, m16
+ mova [tmpq+tsq*0], xm16
+ vextracti128 [tmpq+tsq*2], ym16, 1
+ sub r6d, 0x1800
+ jg .loop
+ RET
+
+cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
+ vpbroadcastd m9, [pd_262144]
+ mova xm15, [warp_8x8_end]
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m16, 19
+ packuswb m16, m16
+ vpermb m16, m15, m16
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ sub r6d, 0x1800
+ jg .loop
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m1, [pd_512]
+%if WIN64
+ mov abcdq, r5mp
+ vpaddd ym18, ym1, r6m {1to8} ; mx
+%else
+ add r5d, 512
+ vpbroadcastd ym18, r5d
+%endif
+ vpaddd ym20, ym1, r7m {1to8} ; my
+ mova ym16, [pd_0to7]
+ vpbroadcastd ym19, [abcdq+4*0]
+ vpbroadcastd ym21, [abcdq+4*1]
+ lea r4, [ssq*3+3]
+ mova m10, [warp_8x8_permA]
+ mov r6d, 0x5555
+ mova m11, [warp_8x8_permB]
+ lea filterq, [mc_warp_filter+64*8]
+ vpbroadcastq m12, [warp_8x8_hpack]
+ sub srcq, r4 ; src -= src_stride*3 + 3
+ vbroadcasti32x4 m13, [warp_8x8_permC]
+ kxnorb k2, k2, k2
+ vbroadcasti32x4 m14, [warp_8x8_permD]
+ vpdpwssd ym18, ym19, ym16 ; alpha
+ vpdpwssd ym20, ym21, ym16 ; gamma
+ vbroadcasti32x4 m0, [srcq]
+ psrad ym19, 16 ; beta
+ psrad ym21, 16 ; delta
+ kmovw k1, r6d
+ psrad ym16, ym18, 10
+ kmovb k3, k2
+ paddd ym18, ym19
+ vpgatherdq m2{k2}, [filterq+ym16*8] ; filter_x0
+ psrld m1, 8 ; pd_2
+ pshufb m0, m11
+ paddd m8, m1, m1 ; pd_4
+ vpdpbusd m1, m0, m2
+ call .h
+ psllq m2, m1, 45
+ pslld m1, 13
+ paddd m1, m2
+ vpshrdq m1, m0, 48 ; 01 12
+ call .h
+ vpshrdq m2, m1, m0, 48 ; 23 34
+ call .h
+ vpshrdq m3, m2, m0, 48 ; 45 56
+.main2:
+ call .h
+ psrad ym17, ym20, 10
+ kmovb k2, k3
+ paddd ym20, ym21
+ vpgatherdq m7{k3}, [filterq+ym17*8] ; filter_y0
+ psrad ym16, ym20, 10
+ kmovb k3, k2
+ paddd ym20, ym21
+ vpgatherdq m17{k2}, [filterq+ym16*8] ; filter_y1
+ shufps m5, m7, m17, q2020 ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
+ mova m16, m9
+ pshufb m4, m5, m13 ; a0 a1 A0 A1 b0 b1 B0 B1
+ vpdpwssd m16, m1, m4
+ pshufb m5, m14 ; a2 a3 A2 A3 b2 b3 B2 B3
+ mova m1, m2
+ vpdpwssd m16, m2, m5
+ shufps m5, m7, m17, q3131 ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
+ mova m2, m3
+ pshufb m4, m5, m13 ; a4 a5 A4 A5 b4 b5 B4 B5
+ vpdpwssd m16, m3, m4
+ vpshrdq m3, m0, 48 ; 67 78
+ pshufb m5, m14 ; a6 a7 A6 A7 b6 b7 B6 B7
+ vpdpwssd m16, m3, m5
+ ret
+ALIGN function_align
+.h:
+ movu xm5, [srcq+ssq*1]
+ psrad ym16, ym18, 10
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym5, [srcq+ssq*0], 1
+ kmovb k2, k3
+ paddd ym18, ym19
+ vpgatherdq m6{k3}, [filterq+ym16*8] ; filter_x1
+ psrad ym17, ym18, 10
+ kmovb k3, k2
+ paddd ym18, ym19
+ vpgatherdq m16{k2}, [filterq+ym17*8] ; filter_x2
+ mova m0, m8
+ vpermb m4, m10, m5 ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7
+ vpshldq m17, m16, m6, 32 ; a4 a5 a6 a7 b0 b1 b2 b3
+ vpdpbusd m0, m4, m17
+ vpermb m5, m11, m5 ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 ba a7 bb
+ vmovdqa32 m16{k1}, m6 ; a0 a1 a2 a3 b4 b5 b6 b7
+ vpdpbusd m0, m5, m16
+ vpmultishiftqb m0, m12, m0 ; 1 1 2 2 (>> 3)
+ ret
+
+%macro BIDIR_FN 1 ; op
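+ ; %1 is the per-block compute macro (AVG, W_AVG, MASK); the caller loads the
+ ; width-specific label into wq, so jmp wq dispatches on log2(width).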
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM %1 0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_ret:
+ RET
+.w4_h16:
+ vpbroadcastd m7, strided
+ pmulld m7, [bidir_sctr_w4]
+ %1 0
+ kxnorw k1, k1, k1
+ vpscatterdd [dstq+m7]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM %1 0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ %1 0
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq ], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w16:
+ %1 0
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m7, [pb_02461357]
+.w32_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m7, [pb_02461357]
+.w64_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m7, [pb_02461357]
+.w128_loop:
+ %1 0
+ vpermq m6, m7, m0
+ %1 2
+ mova [dstq+64*0], m6
+ %1_INC_PTR 4
+ vpermq m6, m7, m0
+ mova [dstq+64*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
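+ ; pmulhrsw with pw_1024 (loaded into m4 by avg_8bpc) is (x + 16) >> 5,
+ ; so the result is (tmp1 + tmp2 + 16) >> 5 per pixel.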
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ paddw m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx512icl_table
+ lea r6, [avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m4, [base+pw_1024]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
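+ ; m4 holds (weight-16) << 12, or -weight << 12 with tmp1/tmp2 swapped when
+ ; weight < 8, so pmulhw performs the "* ... >> 16" step above; the final
+ ; pmulhrsw with pw_2048 is the "(+ 8) >> 4" rounding.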
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ psubw m2, m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ psubw m3, m1, [tmp2q+(%1+1)*mmsize]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg_avx512icl_table
+ lea r6, [w_avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ pxor m0, m0
+ mov tmp1q, tmp2q
+ psubw m4, m0, m4 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
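+ ; m4 is zero here (cleared in mask_8bpc), so psubb/paddb/punpcklbw build
+ ; -m << 9 in the high byte of each word; the << 10 is applied as
+ ; ((b - a) << 1) * (-m << 9) since -m << 10 would not fit in a signed word.
+ ; pmulhrsw with pw_2048 is the final "(+ 8) >> 4".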
+%if mmsize == 64
+ vpermq m3, m8, [maskq+%1*32]
+%else
+ vpermq m3, [maskq+%1*16], q3120
+%endif
+ mova m0, [tmp2q+(%1+0)*mmsize]
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*mmsize]
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*32
+ add tmp2q, %1*64
+ add tmp1q, %1*64
+%endmacro
+
+cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx512icl_table
+ lea r7, [mask_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ pxor m4, m4
+ mova m8, [base+bilin_v_perm64]
+ vpbroadcastd m5, [base+pw_2048]
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
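+ ; The blend mask is presumably m = min(38 + ((abs(a - b) + 8) >> 8), 64),
+ ; matching the ((64 - 38) << 8) + 255 - 8 construction of pw_6903: a
+ ; saturating subtract from that constant followed by psrlw 8 yields 64 - m
+ ; directly, with the saturation providing the clamp.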
+ mova m%1, [tmp1q+mmsize*%3]
+ mova m1, [tmp2q+mmsize*%3]
+ psubw m1, m%1
+ pabsw m%2, m1
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m%1, m1
+ mova m1, [tmp1q+mmsize*%4]
+ mova m2, [tmp2q+mmsize*%4]
+ psubw m2, m1
+ pabsw m3, m2
+ psubusw m3, m6, m3
+ vpshldw m%2, m3, 8
+ psllw m3, m%2, 10
+%if %5
+ psubb m%2, m5, m%2
+%endif
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m%1, m7
+ pmulhrsw m1, m7
+ packuswb m%1, m1
+%endmacro
+
+cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+ lea r7, [w_mask_420_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pb_m64] ; -1 << 6
+ mova ym10, [base+wm_420_mask+32]
+ vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova m5, [wm_420_perm4]
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm4+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ vpermb ym8, ym10, ym8
+ movq [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m11, strided
+ pmulld m11, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ vpdpbusd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vpscatterdd [dstq+m11]{k1}, m0
+ RET
+.w8:
+ mova m5, [wm_420_perm8]
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm8+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 16
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ vpermb m1, m10, m1
+ mova [maskq], xm1
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16:
+ mova m5, [wm_420_perm16]
+.w16_loop:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
+ psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
+.w64_loop:
+ W_MASK 0, 4, 0, 2
+ W_MASK 11, 5, 1, 3
+ mova m2, m8
+ vpdpbusd m2, m4, m9
+ mova m3, m8
+ vpdpbusd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ mova m1, m0
+ vpermt2q m0, m12, m11
+ vpermt2q m1, m13, m11
+ mova [maskq], ym2
+ add maskq, 32
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m14, [wm_420_perm64]
+ mova m10, [wm_420_mask]
+ psrlq m15, m14, 4
+.w128_loop:
+ W_MASK 0, 12, 0, 4
+ W_MASK 11, 13, 1, 5
+ mova m4, m8
+ vpdpbusd m4, m12, m9
+ mova m5, m8
+ vpdpbusd m5, m13, m9
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*1+64*0], m1
+ W_MASK 0, 12, 2, 6
+ W_MASK 11, 13, 3, 7
+ vprold m4, 16
+ vprold m5, 16
+ vpdpbusd m4, m12, m9
+ vpdpbusd m5, m13, m9
+ add tmp1q, 512
+ add tmp2q, 512
+ vpermt2b m4, m10, m5
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0+64*1], m0
+ mova [dstq+strideq*1+64*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+ lea r7, [w_mask_422_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pw_m128]
+ mova m10, [base+wm_422_mask]
+ vpbroadcastd m11, [base+pb_127]
+ add wq, r7
+ vpbroadcastd m8, [base+wm_sign+4+r6*4]
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ pand xm8, xm11
+ mova [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m5, strided
+ pmulld m5, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpdpwssd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ pand ym8, ym11
+ mova [maskq], ym8
+ vpscatterdd [dstq+m5]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ pand xm8, xm11
+ mova [maskq], xm8
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ pand ym1, ym11
+ mova [maskq], ym1
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ pand ym1, ym11
+ mova [maskq], ym1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m5, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m13, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ W_MASK 12, 5, 2, 3
+ mova m2, m8
+ vpdpwssd m2, m4, m9
+ mova m3, m8
+ vpdpwssd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ vpermq m0, m13, m0
+ vpermq m1, m13, m12
+ pand m2, m11
+ mova [maskq], m2
+ add maskq, 64
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx512icl_table
+ lea r7, [w_mask_444_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m5, [base+pb_64]
+ vpbroadcastd m7, [base+pw_2048]
+ mova m8, [base+wm_444_mask]
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ RET
+.w4_h16:
+ vpbroadcastd m9, strided
+ pmulld m9, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ kxnorw k1, k1, k1
+ mova [maskq], m4
+ vpscatterdd [dstq+m9]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ mova [maskq], m4
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ vpermq m0, m0, q3120
+ mova [maskq], m4
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m9, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m9, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m11, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1, 1
+ W_MASK 10, 9, 2, 3, 1
+ vpermb m4, m8, m4
+ vpermb m9, m8, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermq m0, m11, m0
+ vpermq m10, m11, m10
+ mova [maskq+64*0], m4
+ mova [maskq+64*1], m9
+ add maskq, 128
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m10
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx512icl_table
+ lea r6, [blend_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn maskq, maskmp
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m6, [base+pb_64]
+ vpbroadcastd m7, [base+pw_512]
+ sub tmpq, maskq
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ vpbroadcastd xmm1, [dstq+dsq*2]
+ pinsrd xmm1, [dstq+r6 ], 3
+ mova xmm4, [maskq]
+ mova xmm5, [maskq+tmpq]
+ add maskq, 4*4
+ psubb xmm3, xm6, xmm4
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm3, xmm4
+ punpckhbw xmm1, xmm5
+ punpckhbw xmm3, xmm4
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm1, xmm3
+ pmulhrsw xmm0, xm7
+ pmulhrsw xmm1, xm7
+ packuswb xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ pextrd [dstq+dsq*2], xmm0, 2
+ pextrd [dstq+r6 ], xmm0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movq xmm0, [dstq+dsq*0]
+ vpbroadcastq xmm1, [dstq+dsq*1]
+ vpbroadcastq ymm2, [dstq+dsq*2]
+ vpbroadcastq ymm3, [dstq+r6 ]
+ mova ymm4, [maskq]
+ mova ymm5, [maskq+tmpq]
+ add maskq, 8*4
+ vpblendd ymm0, ymm2, 0x30
+ vpblendd ymm1, ymm3, 0xc0
+ psubb ymm3, ym6, ymm4
+ punpcklbw ymm0, ymm5
+ punpcklbw ymm2, ymm3, ymm4
+ punpckhbw ymm1, ymm5
+ punpckhbw ymm3, ymm4
+ pmaddubsw ymm0, ymm2
+ pmaddubsw ymm1, ymm3
+ pmulhrsw ymm0, ym7
+ pmulhrsw ymm1, ym7
+ packuswb ymm0, ymm1
+ vextracti128 xmm1, ymm0, 1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ movq [dstq+dsq*2], xmm1
+ movhps [dstq+r6 ], xmm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ vzeroupper
+ RET
+.w16:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ vinserti32x4 m1, [dstq+dsq*2], 2
+ mova m4, [maskq]
+ vinserti32x4 m1, [dstq+r6 ], 3
+ mova m5, [maskq+tmpq]
+ add maskq, 16*4
+ psubb m3, m6, m4
+ punpcklbw m0, m1, m5
+ punpcklbw m2, m3, m4
+ punpckhbw m1, m5
+ punpckhbw m3, m4
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ vextracti32x4 [dstq+dsq*2], m0, 2
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ mova m4, [maskq]
+ mova m5, [maskq+tmpq]
+ add maskq, 32*2
+ psubb m3, m6, m4
+ punpcklbw m0, m1, m5
+ punpcklbw m2, m3, m4
+ punpckhbw m1, m5
+ punpckhbw m3, m4
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32
+ RET
+
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx512icl_table
+ lea r5, [blend_v_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_avx512icl_table
+ jmp wq
+.w2:
+ vpbroadcastd xmm2, [maskq+2*2]
+.w2_s0_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrw xmm0, [dstq+dsq*1], 1
+ movd xmm1, [tmpq]
+ add tmpq, 2*2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s0_loop
+ RET
+.w4:
+ vpbroadcastq xmm2, [maskq+4*2]
+.w4_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova xmm3, [maskq+8*2]
+.w8_loop:
+ movq xmm0, [dstq+dsq*0]
+ vpbroadcastq xmm1, [dstq+dsq*1]
+ mova xmm2, [tmpq]
+ add tmpq, 8*2
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm3
+ pmaddubsw xmm1, xmm3
+ pmulhrsw xmm0, xm5
+ pmulhrsw xmm1, xm5
+ packuswb xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 ym3, [maskq+16*2]
+ vbroadcasti32x4 ym4, [maskq+16*3]
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ mova ym2, [tmpq]
+ add tmpq, 16*2
+ punpcklbw ym0, ym1, ym2
+ punpckhbw ym1, ym2
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym4
+ pmulhrsw ym0, ym5
+ pmulhrsw ym1, ym5
+ packuswb ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ mova m4, [maskq+32*2]
+ vshufi32x4 m3, m4, m4, q2020
+ vshufi32x4 m4, m4, q3131
+.w32_loop:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+
+cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base r6-blend_h_avx512icl_table
+ lea r6, [blend_h_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea maskq, [base+obmc_masks+hq*2]
+ vpbroadcastd m5, [base+pw_512]
+ lea hd, [hq*3]
+ add wq, r6
+ shr hd, 2 ; h * 3/4
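+ ; the obmc mask values in the last quarter of each row are 64 (no blend),
+ ; so only the first h*3/4 rows are processed; hq then counts up from -(h*3/4)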
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xmm0, [dstq+dsq*0]
+ pinsrw xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movd xmm1, [tmpq]
+ add tmpq, 2*2
+ punpcklwd xmm2, xmm2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova xmm3, [blend_shuf]
+.w4_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ pshufb xmm2, xmm3
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ vbroadcasti128 ymm4, [blend_shuf]
+ shufpd ymm4, ymm4, 0x03
+.w8_loop:
+ vpbroadcastq ymm1, [dstq+dsq*0]
+ movq xmm0, [dstq+dsq*1]
+ vpblendd ymm0, ymm1, 0x30
+ vpbroadcastd ymm3, [maskq+hq*2]
+ movq xmm1, [tmpq+8*1]
+ vinserti128 ymm1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ pshufb ymm3, ymm4
+ punpcklbw ymm0, ymm1
+ pmaddubsw ymm0, ymm3
+ pmulhrsw ymm0, ym5
+ vextracti128 xmm1, ymm0, 1
+ packuswb xmm0, xmm1
+ movhps [dstq+dsq*0], xmm0
+ movq [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ vzeroupper
+ RET
+.w16:
+ vbroadcasti32x4 ym4, [blend_shuf]
+ shufpd ym4, ym4, 0x0c
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ vpbroadcastd ym3, [maskq+hq*2]
+ mova ym2, [tmpq]
+ add tmpq, 16*2
+ pshufb ym3, ym4
+ punpcklbw ym0, ym1, ym2
+ punpckhbw ym1, ym2
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym3
+ pmulhrsw ym0, ym5
+ pmulhrsw ym1, ym5
+ packuswb ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+.w32:
+ vbroadcasti32x4 m4, [blend_shuf]
+ shufpd m4, m4, 0xf0
+.w32_loop:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ vpbroadcastd m3, [maskq+hq*2]
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ pshufb m3, m4
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ vpbroadcastw m3, [maskq+hq*2]
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m6, [maskq+hq*2]
+ mova m2, [dstq+64*0]
+ mova m1, [tmpq+64*0]
+ mova m3, [dstq+64*1]
+ mova m4, [tmpq+64*1]
+ add tmpq, 64*2
+ punpcklbw m0, m2, m1
+ punpckhbw m2, m1
+ pmaddubsw m0, m6
+ pmaddubsw m2, m6
+ punpcklbw m1, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ REPX {pmulhrsw x, m5}, m0, m2, m1, m3
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ mov r6, ~0
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ kmovq k3, r6
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pw_m256]
+ vpbroadcastd m7, [base+pd_63]
+ vbroadcasti32x4 m15, [base+pb_8x0_8x8]
+ vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
+ pslld m5, 4 ; dx*16
+ pslld m6, 14
+ pxor m2, m2
+ mova m16, [base+resize_permA]
+ mova m17, [base+resize_permB]
+ mova xm18, [base+resize_permC]
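+ ; each lane of m8 is a source x position in 14-bit fixed point for one of
+ ; 16 output pixels; bits 8-13 (psrad 8, pand 63) select one of the 64 8-tap
+ ; resize filters and bits 14+ give the integer source column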
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ vptestmd k4, m1, m1
+ pand m9, m7 ; filter offset (masked)
+ ktestw k4, k4
+ jz .load
+ vextracti32x8 ym12, m0, 1
+ vextracti32x8 ym13, m1, 1
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdq m10{k1}, [srcq+ym0]
+ vpgatherdq m11{k2}, [srcq+ym12]
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdq m14{k1}, [base+resize_shuf+4+ym1]
+ vpgatherdq m0{k2}, [base+resize_shuf+4+ym13]
+ mova m12, m16
+ mova m13, m17
+ paddb m14, m15
+ paddb m0, m15
+ pshufb m10, m14
+ pshufb m11, m0
+ vpermi2d m12, m10, m11
+ vpermi2d m13, m10, m11
+ jmp .filter
+.load:
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdd m12{k1}, [srcq+m0+0]
+ vpgatherdd m13{k2}, [srcq+m0+4]
+.filter:
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdd m10{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m11{k2}, [base+resize_filter+m9*8+4]
+ mova m14, m2
+ vpdpbusd m14, m12, m10
+ vpdpbusd m14, m13, m11
+ packssdw m14, m14
+ pmulhrsw m14, m3
+ packuswb m14, m14
+ vpermd m14, m18, m14
+ mova [dstq+xq], xm14
+ paddd m4, m5
+ add xd, 16
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm
new file mode 100644
index 0000000000..54939c647a
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_sse.asm
@@ -0,0 +1,9599 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+; dav1d_obmc_masks[] with 64-x interleaved
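+; each pair is {m, 64-m}, so one pmaddubsw of interleaved {dst, tmp} bytes
+; computes m*dst + (64-m)*tmp; pmulhrsw with pw_512 then rounds: (x + 32) >> 6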
+obmc_masks: db 0, 0, 0, 0
+ ; 2 @4
+ db 45, 19, 64, 0
+ ; 4 @8
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8 @16
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16 @32
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32 @64
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+rescale_mul: dd 0, 1, 2, 3
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+
+wm_420_sign: times 4 dw 258
+ times 4 dw 257
+wm_422_sign: times 8 db 128
+ times 8 db 127
+
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+bdct_lb_dw: times 4 db 0
+ times 4 db 4
+ times 4 db 8
+ times 4 db 12
+
+pb_64: times 16 db 64
+pw_m256: times 8 dw -256
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_8: times 8 dw 8
+pw_15: times 8 dw 15
+pw_26: times 8 dw 26
+pw_34: times 8 dw 34
+pw_512: times 8 dw 512
+pw_1024: times 8 dw 1024
+pw_2048: times 8 dw 2048
+pw_6903: times 8 dw 6903
+pw_8192: times 8 dw 8192
+pd_32: times 4 dd 32
+pd_63: times 4 dd 63
+pd_512: times 4 dd 512
+pd_16384: times 4 dd 16384
+pd_32768: times 4 dd 32768
+pd_262144:times 4 dd 262144
+pd_0x3ff: times 4 dd 0x3ff
+pd_0x4000:times 4 dd 0x4000
+pq_0x40000000: times 2 dq 0x40000000
+
+const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage
+ ; [-1, 0)
+ db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0
+ db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0
+ db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0
+ db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0
+ db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0
+ db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0
+ db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0
+ db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0
+ db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0
+ db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0
+ db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0
+ db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0
+ db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0
+ db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0
+ db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0
+ db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0
+ db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0
+ db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0
+ db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0
+ db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0
+ db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0
+ db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0
+ db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0
+ db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0
+ db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0
+ db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0
+ db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0
+ db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0
+ db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0
+ db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0
+ db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0
+ db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0
+ ; [0, 1)
+ db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0
+ db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0
+ db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1
+ db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1
+ db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1
+ db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1
+ db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1
+ db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1
+ db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2
+ db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2
+ db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2
+ db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2
+ db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2
+ db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2
+ db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2
+ db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2
+ db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2
+ db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2
+ db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2
+ db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2
+ db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2
+ db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2
+ db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2
+ db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2
+ db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2
+ db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1
+ db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2
+ db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1
+ db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1
+ db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1
+ db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0
+ db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0
+ ; [1, 2)
+ db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0
+ db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1
+ db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1
+ db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1
+ db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1
+ db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2
+ db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2
+ db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2
+ db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3
+ db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3
+ db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3
+ db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4
+ db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4
+ db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4
+ db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4
+ db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4
+ db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4
+ db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4
+ db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4
+ db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4
+ db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4
+ db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4
+ db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4
+ db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3
+ db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3
+ db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3
+ db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2
+ db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2
+ db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2
+ db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1
+ db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1
+ db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0
+ db 0, 0, 2, -1, 0, 0, 127, 0
+
+pw_258: times 2 dw 258
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
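+; mx/my subpel positions are 1-based (0 means no subpel filtering), so the -8
+; makes [subpel_filters+idx*8] address 8-byte filter row idx-1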
+
+%macro BIDIR_JMP_TABLE 2-*
+ ;evaluated at definition time (in loop below)
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ ; dynamically generated label
+ %%table:
+ %rep %0 - 2 ; repeat for num args
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep)
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)
+
+BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+SECTION .text
+
+INIT_XMM ssse3
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1
+ %define base t0-put_ssse3
+%else
+ DECLARE_REG_TMP 7
+ %define base 0
+%endif
+
+%macro RESTORE_DSQ_32 1
+ %if ARCH_X86_32
+ mov %1, dsm ; restore dsq
+ %endif
+%endmacro
+
+cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ movifnidn srcq, srcmp
+ movifnidn ssq, ssmp
+ tzcnt wd, wm
+ mov hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [t0+wq*2+table_offset(put,)]
+ add wq, t0
+ RESTORE_DSQ_32 t0
+ jmp wq
+.put_w2:
+ movzx r4d, word [srcq+ssq*0]
+ movzx r6d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4w
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r4d, [srcq+ssq*0]
+ mov r6d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4d
+ mov [dstq+dsq*1], r6d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0+16*0]
+ movu m1, [srcq+ssq*0+16*1]
+ movu m2, [srcq+ssq*1+16*0]
+ movu m3, [srcq+ssq*1+16*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+16*0], m0
+ mova [dstq+dsq*0+16*1], m1
+ mova [dstq+dsq*1+16*0], m2
+ mova [dstq+dsq*1+16*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, ssq
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
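+ ; mxyd*0x00ff00ff + 0x00100010 packs {16-mx, mx} into every byte pair, so
+ ; pmaddubsw against the shuffled {src[x], src[x+1]} pairs gives the weighted
+ ; sum and pmulhrsw with pw_2048 performs the (x + 8) >> 4 rounding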
+ imul mxyd, 0x00ff00ff
+ mova m4, [base+bilin_h_shuf8]
+ mova m0, [base+bilin_h_shuf4]
+ add mxyd, 0x00100010
+ movd m5, mxyd
+ mov mxyd, r7m ; my
+ pshufd m5, m5, q0000
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
+ mova m3, [base+pw_2048]
+ add wq, t0
+ movifnidn dsq, dsmp
+ jmp wq
+.h_w2:
+ pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
+.h_w2_loop:
+ movd m0, [srcq+ssq*0]
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq m0, m1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ pmulhrsw m0, m3
+ packuswb m0, m0
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movq m4, [srcq+ssq*0]
+ movhps m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m4, m0
+ pmaddubsw m4, m5
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movd [dstq+dsq*0], m4
+ psrlq m4, 32
+ movd [dstq+dsq*1], m4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+mmsize*0+8*0]
+ movu m1, [srcq+mmsize*0+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movu m1, [srcq+mmsize*1+8*0]
+ movu m2, [srcq+mmsize*1+8*1]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ mov r6, -16*3
+.h_w64_loop:
+ movu m0, [srcq+r6+16*3+8*0]
+ movu m1, [srcq+r6+16*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+16*3], m0
+ add r6, 16
+ jle .h_w64_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov r6, -16*7
+.h_w128_loop:
+ movu m0, [srcq+r6+16*7+8*0]
+ movu m1, [srcq+r6+16*7+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+16*7], m0
+ add r6, 16
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0x00ff00ff
+ mova m5, [base+pw_2048]
+ add mxyd, 0x00100010
+ add wq, t0
+ movd m4, mxyd
+ pshufd m4, m4, q0000
+ movifnidn dsq, dsmp
+ jmp wq
+.v_w2:
+ movd m0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw m0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pshuflw m1, m0, q2301
+ pinsrw m0, [srcq+ssq*0], 0 ; 2 1
+ punpcklbw m1, m0
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ movd r6d, m1
+ mov [dstq+dsq*1], r6w
+ shr r6d, 16
+ mov [dstq+dsq*0], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd m0, [srcq+ssq*0]
+.v_w4_loop:
+ movd m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m1, m0
+ movd m0, [srcq+ssq*0]
+ punpckldq m1, m2 ; 0 1
+ punpckldq m2, m0 ; 1 2
+ punpcklbw m1, m2
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq m0, [srcq+ssq*0]
+.v_w8_loop:
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m1, m0
+ movq m0, [srcq+ssq*0]
+ punpcklbw m1, m2
+ punpcklbw m2, m0
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+%macro PUT_BILIN_V_W16 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m1, m0
+ mova m2, m0
+ movu m0, [srcq+ssq*0]
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+.v_w16:
+ PUT_BILIN_V_W16
+ RET
+.v_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .v_w16gt
+.v_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .v_w16gt
+.v_w32:
+ lea r6d, [hq+(1<<16)]
+.v_w16gt:
+ mov r4, srcq
+%if ARCH_X86_64
+ mov r7, dstq
+%endif
+.v_w16gt_loop:
+ PUT_BILIN_V_W16
+%if ARCH_X86_64
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%else
+ mov dstq, dstmp
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstmp, dstq
+%endif
+ sub r6d, 1<<16
+ jg .v_w16gt
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
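+ ; the h pass doubles its coefficients (paddb m5, m5) so intermediates are
+ ; 32x-scaled; pmulhw with my<<11 then yields (my*diff) >> 4, and pavgw with
+ ; pw_15 halves back to 16x scale while adding the +8 bias before psrlw 4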
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ mova m7, [base+pw_15]
+ movd m6, mxyd
+ add wq, t0
+ pshuflw m6, m6, q0000
+ paddb m5, m5
+ punpcklqdq m6, m6
+ jmp wq
+.hv_w2:
+ RESTORE_DSQ_32 t0
+ movd m0, [srcq+ssq*0]
+ punpckldq m0, m0
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w2_loop:
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m2, [srcq+ssq*0]
+ punpckldq m1, m2
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 _ 2 _
+ shufps m2, m0, m1, q1032 ; 0 _ 1 _
+ mova m0, m1
+ psubw m1, m2 ; 2 * (src[x + src_stride] - src[x])
+ pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])) >> 4
+ pavgw m2, m7 ; src[x] + 8
+ paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
+ psrlw m1, 4
+ packuswb m1, m1
+%if ARCH_X86_64
+ movq r6, m1
+%else
+ pshuflw m1, m1, q2020
+ movd r6d, m1
+%endif
+ mov [dstq+dsq*0], r6w
+ shr r6, gprsize*4
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m4, [base+bilin_h_shuf4]
+ movddup m0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ shufps m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhw m1, m6
+ pavgw m2, m7
+ paddw m1, m2
+ psrlw m1, 4
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ pmulhw m1, m6
+ pavgw m0, m7
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ psubw m3, m0, m2
+ pmulhw m3, m6
+ pavgw m2, m7
+ paddw m3, m2
+ psrlw m1, 4
+ psrlw m3, 4
+ packuswb m1, m3
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .hv_w16_start
+.hv_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w16_start
+.hv_w32:
+ lea r6d, [hq+(1<<16)]
+.hv_w16_start:
+ mov r4, srcq
+%if ARCH_X86_32
+ %define m8 [dstq]
+%else
+ mov r7, dstq
+%endif
+.hv_w16:
+ movifnidn dsq, dsmp
+%if WIN64
+ movaps r4m, m8
+%endif
+.hv_w16_loop0:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w16_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova m8, m2
+ psubw m2, m0
+ pmulhw m2, m6
+ pavgw m0, m7
+ paddw m2, m0
+ mova m0, m3
+ psubw m3, m1
+ pmulhw m3, m6
+ pavgw m1, m7
+ paddw m3, m1
+ mova m1, m0
+ mova m0, m8
+ psrlw m2, 4
+ psrlw m3, 4
+ packuswb m2, m3
+ mova [dstq], m2
+ add dstq, dsmp
+ dec hd
+ jg .hv_w16_loop
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstm, dstq
+%else
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%endif
+ sub r6d, 1<<16
+ jg .hv_w16_loop0
+%if WIN64
+ movaps m8, r4m
+%endif
+ RET
+
+%macro PSHUFB_BILIN_H8 2 ; dst, src
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ psrldq %2, %1, 1
+ punpcklbw %1, %2
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ psrldq %2, %1, 1
+ punpckhbw %3, %1, %2
+ punpcklbw %1, %2
+ punpcklqdq %1, %3
+ %endif
+%endmacro
+
+%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero
+ %if cpuflag(ssse3)
+ pmaddubsw %1, %2
+ %else
+ %if %5 == 1
+ pxor %3, %3
+ %endif
+ punpckhbw %4, %1, %3
+ punpcklbw %1, %1, %3
+ pmaddwd %4, %2
+ pmaddwd %1, %2
+ packssdw %1, %4
+ %endif
+%endmacro
+
+%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift
+ %if cpuflag(ssse3)
+ pmulhrsw %1, %2
+ %else
+ punpckhwd %3, %1, %4
+ punpcklwd %1, %4
+ pmaddwd %3, %2
+ pmaddwd %1, %2
+ psrad %3, %5
+ psrad %1, %5
+ packssdw %1, %3
+ %endif
+%endmacro
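+; the non-SSSE3 branch of PMULHRSW interleaves each word of %1 with rndval and
+; pmaddwd's against the {coef, 1} word pairs in %2, computing
+; (x*coef + rndval) >> shift; with {my, 1}, pw_8 and 4 this equals the SSSE3
+; path's pmulhrsw by my<<11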
+
+%macro PREP_BILIN 0
+%if ARCH_X86_32
+ %define base r6-prep%+SUFFIX
+%else
+ %define base 0
+%endif
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ LEA r6, prep%+SUFFIX
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+%if notcpuflag(ssse3)
+ add r6, prep_ssse3 - prep_sse2
+ jmp prep_ssse3
+%else
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ pxor m4, m4
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd m0, [srcq+strideq*0]
+ movd m1, [srcq+strideq*1]
+ movd m2, [srcq+strideq*2]
+ movd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ punpcklbw m0, m4
+ punpcklbw m2, m4
+ psllw m0, 4
+ psllw m2, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq m0, [srcq+strideq*0]
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*2]
+ movq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu m1, [srcq+strideq*0]
+ movu m3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .prep_w16
+ RET
+.prep_w128:
+ mov r3, -128
+ jmp .prep_w32_start
+.prep_w64:
+ mov r3, -64
+ jmp .prep_w32_start
+.prep_w32:
+ mov r3, -32
+.prep_w32_start:
+ sub srcq, r3
+.prep_w32_vloop:
+ mov r6, r3
+.prep_w32_hloop:
+ movu m1, [srcq+r6+16*0]
+ movu m3, [srcq+r6+16*1]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ add r6, 32
+ jl .prep_w32_hloop
+ add srcq, strideq
+ dec hd
+ jg .prep_w32_vloop
+ RET
+%endif
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
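+ ; ssse3: pack {16-mx, mx} byte pairs for pmaddubsw, as in put_bilin;
+ ; sse2: mxyd*0xffff + 16 forms the {16-mx, mx} word pair used by pmaddwd
+ ; inside the PMADDUBSW fallback macro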
+%if cpuflag(ssse3)
+ imul mxyd, 0x00ff00ff
+ mova m4, [base+bilin_h_shuf8]
+ add mxyd, 0x00100010
+%else
+ imul mxyd, 0xffff
+ add mxyd, 16
+%endif
+ movd m5, mxyd
+ mov mxyd, r6m ; my
+ pshufd m5, m5, q0000
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+%if notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+ pxor m6, m6
+%endif
+ add wq, r6
+ jmp wq
+.h_w4:
+%if cpuflag(ssse3)
+ mova m4, [base+bilin_h_shuf4]
+%endif
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*2]
+ movhps m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ PSHUFB_BILIN_H4 m0, m4, m2
+ PMADDUBSW m0, m5, m6, m2, 0
+ PSHUFB_BILIN_H4 m1, m4, m2
+ PMADDUBSW m1, m5, m6, m2, 0
+ mova [tmpq+0 ], m0
+ mova [tmpq+16], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+ movu m2, [srcq+strideq*1+8*0]
+ movu m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w128:
+ mov r3, -128
+ jmp .h_w32_start
+.h_w64:
+ mov r3, -64
+ jmp .h_w32_start
+.h_w32:
+ mov r3, -32
+.h_w32_start:
+ sub srcq, r3
+.h_w32_vloop:
+ mov r6, r3
+.h_w32_hloop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ movu m2, [srcq+r6+8*2]
+ movu m3, [srcq+r6+8*3]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ add r6, 32
+ jl .h_w32_hloop
+ add srcq, strideq
+ dec hd
+ jg .h_w32_vloop
+ RET
+.v:
+%if notcpuflag(ssse3)
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+%endif
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+%if cpuflag(ssse3)
+ imul mxyd, 0x00ff00ff
+ add mxyd, 0x00100010
+%else
+ imul mxyd, 0xffff
+ pxor m6, m6
+ add mxyd, 16
+%endif
+ add wq, r6
+ lea stride3q, [strideq*3]
+ movd m5, mxyd
+ pshufd m5, m5, q0000
+ jmp wq
+.v_w4:
+ movd m0, [srcq+strideq*0]
+.v_w4_loop:
+ movd m1, [srcq+strideq*1]
+ movd m2, [srcq+strideq*2]
+ movd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m0, m1
+ punpckldq m1, m2
+ punpcklbw m0, m1 ; 01 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ movd m0, [srcq+strideq*0]
+ punpckldq m2, m3
+ punpckldq m3, m0
+ punpcklbw m2, m3 ; 23 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq m0, [srcq+strideq*0]
+.v_w8_loop:
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*2]
+ movq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m0, m1 ; 01
+ punpcklbw m1, m2 ; 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ movq m0, [srcq+strideq*0]
+ punpcklbw m2, m3 ; 23
+ punpcklbw m3, m0 ; 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m1
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m0, m1
+ punpckhbw m0, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*1], m0
+ movu m0, [srcq+strideq*0]
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*3], m1
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*4], m4
+ punpcklbw m4, m3, m0
+ punpckhbw m3, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*5], m2
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*6], m4
+ mova [tmpq+16*7], m3
+ add tmpq, 16*8
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w128:
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 256
+ jmp .v_w32_start
+.v_w64:
+ lea r3d, [hq+(1<<8)]
+ mov r6d, 128
+ jmp .v_w32_start
+.v_w32:
+ xor r3d, r3d
+ mov r6d, 64
+.v_w32_start:
+%if ARCH_X86_64
+ %if WIN64
+ PUSH r7
+ %endif
+ mov r7, tmpq
+%endif
+ mov r5, srcq
+.v_w32_hloop:
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+.v_w32_vloop:
+ movu m2, [srcq+strideq*1+16*0]
+ movu m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ mova [tmpq+16*1], m0
+ movu m0, [srcq+strideq*0+16*0]
+ punpcklbw m4, m1, m3
+ punpckhbw m1, m3
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ mova [tmpq+16*3], m1
+ movu m1, [srcq+strideq*0+16*1]
+ add tmpq, r6
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ mova [tmpq+16*1], m2
+ punpcklbw m4, m3, m1
+ punpckhbw m3, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ mova [tmpq+16*3], m3
+ add tmpq, r6
+ sub hd, 2
+ jg .v_w32_vloop
+ add r5, 32
+ movzx hd, r3b
+ mov srcq, r5
+%if ARCH_X86_64
+ add r7, 16*4
+ mov tmpq, r7
+%else
+ mov tmpq, tmpmp
+ add tmpq, 16*4
+ mov tmpmp, tmpq
+%endif
+ sub r3d, 1<<8
+ jg .v_w32_hloop
+%if WIN64
+ POP r7
+%endif
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
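+ ; ssse3: pmulhrsw with my<<11 computes ((my*diff) + 8) >> 4 directly;
+ ; sse2: the PMULHRSW macro reaches the same result via pmaddwd of {diff, 8}
+ ; word pairs with {my, 1} followed by psrad 4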
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+%assign stack_offset stack_offset - stack_size_padded
+%if cpuflag(ssse3)
+ imul mxyd, 0x08000800
+ WIN64_SPILL_XMM 8
+%else
+ or mxyd, 1<<16
+ WIN64_SPILL_XMM 9
+ %if ARCH_X86_64
+ mova m8, [base+pw_8]
+ %else
+ %define m8 [base+pw_8]
+ %endif
+ pxor m7, m7
+%endif
+ movd m6, mxyd
+ add wq, r6
+ pshufd m6, m6, q0000
+ jmp wq
+.hv_w4:
+%if cpuflag(ssse3)
+ mova m4, [base+bilin_h_shuf4]
+ movddup m0, [srcq+strideq*0]
+%else
+ movhps m0, [srcq+strideq*0]
+%endif
+ lea r3, [strideq*3]
+ PSHUFB_BILIN_H4 m0, m4, m3
+ PMADDUBSW m0, m5, m7, m4, 0 ; _ 0
+.hv_w4_loop:
+ movq m1, [srcq+strideq*1]
+ movhps m1, [srcq+strideq*2]
+ movq m2, [srcq+r3 ]
+ lea srcq, [srcq+strideq*4]
+ movhps m2, [srcq+strideq*0]
+ PSHUFB_BILIN_H4 m1, m4, m3
+ PSHUFB_BILIN_H4 m2, m4, m3
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1 2
+ PMADDUBSW m2, m5, m7, m4, 0 ; 3 4
+ shufpd m0, m1, 0x01 ; 0 1
+ shufpd m3, m1, m2, 0x01 ; 2 3
+ psubw m1, m0
+ PMULHRSW m1, m6, m4, m8, 4
+ paddw m1, m0
+ mova m0, m2
+ psubw m2, m3
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m3
+ mova [tmpq+16*0], m1
+ mova [tmpq+16*1], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0
+.hv_w8_loop:
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu m2, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1
+ PMADDUBSW m2, m5, m7, m4, 0 ; 2
+ psubw m3, m1, m0
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m0
+ mova m0, m2
+ psubw m2, m1
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m1
+ mova [tmpq+16*0], m3
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r5d, 256
+ jmp .hv_w16_start
+.hv_w64:
+ lea r3d, [hq+(3<<8)]
+ mov r5d, 128
+ jmp .hv_w16_start
+.hv_w32:
+ lea r3d, [hq+(1<<8)]
+ mov r5d, 64
+ jmp .hv_w16_start
+.hv_w16:
+ xor r3d, r3d
+ mov r5d, 32
+.hv_w16_start:
+%if ARCH_X86_64 || cpuflag(ssse3)
+ mov r6, srcq
+%endif
+%if ARCH_X86_64
+ %if WIN64
+ PUSH r7
+ %endif
+ mov r7, tmpq
+%endif
+.hv_w16_hloop:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0a
+ PMADDUBSW m1, m5, m7, m4, 0 ; 0b
+.hv_w16_vloop:
+ movu m2, [srcq+strideq*1+8*0]
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m2, m5, m7, m4, 0 ; 1a
+ psubw m3, m2, m0
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m0
+ mova [tmpq+16*0], m3
+ movu m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m3, m5, m7, m4, 0 ; 1b
+ psubw m0, m3, m1
+ PMULHRSW m0, m6, m4, m8, 4
+ paddw m0, m1
+ mova [tmpq+16*1], m0
+ add tmpq, r5
+ movu m0, [srcq+strideq*0+8*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 2a
+ psubw m1, m0, m2
+ PMULHRSW m1, m6, m4, m8, 4
+ paddw m1, m2
+ mova [tmpq+16*0], m1
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 2b
+ psubw m2, m1, m3
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m3
+ mova [tmpq+16*1], m2
+ add tmpq, r5
+ sub hd, 2
+ jg .hv_w16_vloop
+ movzx hd, r3b
+%if ARCH_X86_64
+ add r6, 16
+ add r7, 2*16
+ mov srcq, r6
+ mov tmpq, r7
+%elif cpuflag(ssse3)
+ mov tmpq, tmpm
+ add r6, 16
+ add tmpq, 2*16
+ mov srcq, r6
+ mov tmpm, tmpq
+%else
+ mov srcq, srcm
+ mov tmpq, tmpm
+ add srcq, 16
+ add tmpq, 2*16
+ mov srcm, srcq
+ mov tmpm, tmpq
+%endif
+ sub r3d, 1<<8
+ jg .hv_w16_hloop
+%if WIN64
+ POP r7
+%endif
+ RET
+%endmacro
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
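+; low/high 16 bits hold the row offsets of the 4-tap and 8-tap filter sets
+; (15 8-byte rows per set); put/prep_8tap add this to mx*0x010101 so that the
+; low byte indexes the 4-tap filter (small block sizes) and bits 16+ the 8-tap one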
+
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2
+%elif WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+FN put_8tap, sharp, SHARP, SHARP
+FN put_8tap, sharp_smooth, SHARP, SMOOTH
+FN put_8tap, smooth_sharp, SMOOTH, SHARP
+FN put_8tap, smooth, SMOOTH, SMOOTH
+FN put_8tap, sharp_regular, SHARP, REGULAR
+FN put_8tap, regular_sharp, REGULAR, SHARP
+FN put_8tap, smooth_regular, SMOOTH, REGULAR
+FN put_8tap, regular_smooth, REGULAR, SMOOTH
+FN put_8tap, regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+ %define base_reg r1
+ %define base base_reg-put_ssse3
+%else
+ %define base_reg r8
+ %define base 0
+%endif
+
+cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+%assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+%if ARCH_X86_64
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+%else
+ imul ssd, mym, 0x010101
+ add ssd, t1d ; 8tap_v, my, 4tap_v
+ mov srcq, srcm
+%endif
+ mov wd, wm
+ movifnidn hd, hm
+ LEA base_reg, put_ssse3
+ test mxd, 0xf00
+ jnz .h
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00
+%endif
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [base_reg+wq*2+table_offset(put,)]
+ add wq, base_reg
+; put_bilin mangling jump
+%assign stack_offset org_stack_offset
+ movifnidn dsq, dsmp
+ movifnidn ssq, ssmp
+%if WIN64
+ pop r8
+%endif
+ lea r6, [ssq*3]
+ jmp wq
+.h:
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00
+%endif
+ jnz .hv
+ movifnidn ssq, ssmp
+ WIN64_SPILL_XMM 12
+ cmp wd, 4
+ jl .h_w2
+ je .h_w4
+ tzcnt wd, wd
+%if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
+ movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ mova m7, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
+ add wq, base_reg
+ jmp wq
+.h_w2:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ mova m4, [base+subpel_h_shuf4]
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
+.h_w2_loop:
+ movq m0, [srcq+ssq*0]
+ movhps m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pmaddubsw m0, m3
+ phaddw m0, m0
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+ mova m6, [base+subpel_h_shufA]
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
+.h_w4_loop:
+ movq m0, [srcq+ssq*0] ; 1
+ movq m1, [srcq+ssq*1] ; 2
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m6 ; subpel_h_shufA
+ pshufb m1, m6 ; subpel_h_shufA
+ pmaddubsw m0, m3 ; subpel_filters
+ pmaddubsw m1, m3 ; subpel_filters
+ phaddw m0, m1
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ %if ARCH_X86_32
+ pshufb %2, %1, [base+subpel_h_shufB]
+ pshufb %3, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %2, %1, m11; subpel_h_shufB
+ pshufb %3, %1, m9 ; subpel_h_shufC
+ pshufb %1, m10 ; subpel_h_shufA
+ %endif
+ pmaddubsw %4, %2, m5 ; subpel +0 B0
+ pmaddubsw %2, m6 ; subpel +4 B4
+ pmaddubsw %3, m6 ; C4
+ pmaddubsw %1, m5 ; A0
+ paddw %3, %4 ; C4+B0
+ paddw %1, %2 ; A0+B4
+ phaddw %1, %3
+ paddw %1, m7 ; pw34
+ psraw %1, 6
+%endmacro
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H m0, m2, m3, m4
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1
+%if ARCH_X86_32
+ movq [dstq], m0
+ add dstq, dsm
+ movhps [dstq], m0
+ add dstq, dsm
+%else
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w128:
+ mov r4, -16*7
+ jmp .h_w16_start
+.h_w64:
+ mov r4, -16*3
+ jmp .h_w16_start
+.h_w32:
+ mov r4, -16*1
+ jmp .h_w16_start
+.h_w16:
+ xor r4d, r4d
+.h_w16_start:
+ sub srcq, r4
+ sub dstq, r4
+.h_w16_loop_v:
+ mov r6, r4
+.h_w16_loop_h:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H m0, m2, m3, m4
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 16
+ jle .h_w16_loop_h
+ add srcq, ssq
+ add dstq, dsmp
+ dec hd
+ jg .h_w16_loop_v
+ RET
+.v:
+%if ARCH_X86_32
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
+%endif
+ tzcnt r6d, wd
+ movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+ punpcklwd m0, m0
+ mova m7, [base+pw_512]
+ add r6, base_reg
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
+ ALLOC_STACK -16*4
+%assign regs_used 7
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
+ mov ssq, [rstk+stack_offset+gprsize*4]
+ lea ssq, [ssq*3]
+ sub srcq, ssq
+ mov ssq, [rstk+stack_offset+gprsize*4]
+ mov dsq, [rstk+stack_offset+gprsize*2]
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ lea ss3q, [ssq*3]
+ pshufd m8, m0, q0000
+ sub srcq, ss3q
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+%endif
+ jmp r6
+.v_w2:
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+%else
+ movd m2, [srcq+ssq*2]
+ add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
+%endif
+ punpcklwd m1, m0 ; 0 1
+ punpcklwd m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpcklwd m2, m5 ; 2 3
+ punpcklwd m5, m3 ; 3 4
+ punpcklwd m3, m4 ; 4 5
+ punpcklwd m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
+.v_w2_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ punpcklwd m3, m0, m4 ; 6 7
+ movd m0, [srcq+ssq*0]
+ punpcklwd m4, m0 ; 7 8
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ movd r6d, m5
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+%if ARCH_X86_32
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ %define dstm [rsp+mmsize*4+gprsize]
+ mov dstm, dstq
+%endif
+ lea r6d, [hq+wq-(1<<16)]
+ mov r4, srcq
+.v_w4_loop0:
+%endif
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+%else
+ movd m2, [srcq+ssq*2]
+ add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
+%endif
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpckldq m2, m5 ; 2 3
+ punpckldq m5, m3 ; 3 4
+ punpckldq m3, m4 ; 4 5
+ punpckldq m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
+.v_w4_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+ssq*0]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ movd [dstq+dsq*0], m5
+ psrlq m5, 32
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
+ sub r6d, 1<<16
+ jg .v_w4_loop0
+%endif
+ RET
+%if ARCH_X86_64
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
+.v_w8_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m4, [srcq+ssq*0]
+ movq m5, [srcq+ssq*1]
+ movq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m0, [srcq+ssq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
+.v_w8_loop:
+ movq m13, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m14, m1, subpel0 ; a0
+ mova m1, m3
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ mova m12, m0
+ pmaddubsw m4, subpel1 ; b1
+ movq m0, [srcq+ssq*0]
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ pmaddubsw m5, subpel2 ; a2
+ mova m4, m6
+ pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m13 ; 67
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
+ paddw m15, m6
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ movq [dstq+dsq*0], m14
+ movhps [dstq+dsq*1], m14
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ add r4, 8
+ add r7, 8
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+.hv:
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+%if ARCH_X86_32
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ mov ssq, ssmp
+ lea r6, [ssq*3]
+ sub srcq, r6
+ %define base_reg r6
+ mov r6, r1; use as new base
+ %assign regs_used 2
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ mov dsq, [rstk+stack_offset+gprsize*2]
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m6, m0, q0000
+ mova subpelv0, m6
+ pshufd m6, m0, q1111
+ mova subpelv1, m6
+ pshufd m6, m0, q2222
+ mova subpelv2, m6
+ pshufd m6, m0, q3333
+ mova subpelv3, m6
+%else
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
+ ALLOC_STACK mmsize*14, 14
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ mova m8, [base+pw_8192]
+ mova m9, [base+pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ pshufd m7, m1, q0000
+ cmp wd, 4
+ je .hv_w4
+.hv_w2:
+ mova m6, [base+subpel_h_shuf4]
+ movq m2, [srcq+ssq*0] ; 0
+ movhps m2, [srcq+ssq*1] ; 0 _ 1
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+ lea srcq, [srcq+ssq*2]
+ movq m0, [srcq+ssq*0] ; 2
+ movhps m0, [srcq+ssq*1] ; 2 _ 3
+ lea srcq, [srcq+ssq*2]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+ movq m0, [srcq+ssq*2] ; 2
+ add srcq, ss3q
+ movhps m0, [srcq+ssq*0] ; 2 _ 3
+%endif
+ pshufb m2, m6 ; 0 ~ 1 ~
+ pshufb m0, m6 ; 2 ~ 3 ~
+ pmaddubsw m2, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m2, m0 ; 0 1 2 3
+ pmulhrsw m2, w8192reg
+%if ARCH_X86_32
+ movq m3, [srcq+ssq*0] ; 4
+ movhps m3, [srcq+ssq*1] ; 4 _ 5
+ lea srcq, [srcq+ssq*2]
+%else
+ movq m3, [srcq+ssq*1] ; 4
+ movhps m3, [srcq+ssq*2] ; 4 _ 5
+ add srcq, ss3q
+%endif
+ movq m0, [srcq+ssq*0] ; 6
+ pshufb m3, m6 ; 4 ~ 5 ~
+ pshufb m0, m6 ; 6 ~
+ pmaddubsw m3, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m3, m0 ; 4 5 6 _
+ pmulhrsw m3, w8192reg
+ palignr m4, m3, m2, 4; V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2
+ punpckhwd m2, m4 ; V 23 34 2 3 3 4
+ pshufd m0, m3, q2121; V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56 4 5 5 6
+.hv_w2_loop:
+ movq m4, [srcq+ssq*1] ; V 7
+ lea srcq, [srcq+ssq*2] ; V
+ movhps m4, [srcq+ssq*0] ; V 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2 ; V
+ pmaddwd m2, subpelv1 ; V a1 b1
+ paddd m5, m2 ; V
+ mova m2, m3 ; V
+ pmaddwd m3, subpelv2 ; a2 b2
+ phaddw m4, m4
+ pmulhrsw m4, w8192reg
+ paddd m5, m3 ; V
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; V 67 78
+ pmaddwd m4, m3, subpelv3 ; V a3 b3
+ paddd m5, d512reg
+ paddd m5, m4
+ psrad m5, 10
+ packssdw m5, m5
+ packuswb m5, m5
+ movd r4d, m5
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+%undef w8192reg
+%undef d512reg
+.hv_w4:
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+%macro SAVELINE_W4 3
+ mova [rsp+mmsize*hv4_line_%3_%2], %1
+%endmacro
+%macro RESTORELINE_W4 3
+ mova %1, [rsp+mmsize*hv4_line_%3_%2]
+%endmacro
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 0 _ _ _
+ movhps m5, [srcq+ssq*1] ; 0 _ 1 _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movq m4, [srcq+ssq*0] ; 2 _ _ _
+ movhps m4, [srcq+ssq*1] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*2]
+%else
+ movq m4, [srcq+ssq*2] ; 2 _ _ _
+ movhps m4, [srcq+ss3q ] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*4]
+%endif
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ ;
+ ; lower shuffle
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 4 _ _ _
+ movhps m5, [srcq+ssq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movq m4, [srcq+ssq*0] ; 6 _ _ _
+ add srcq, ssq
+%else
+ movq m4, [srcq+ssq*2] ; 6 _ _ _
+ add srcq, ss3q
+%endif
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ ;process high
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ;process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop:
+ ;process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ mova m6, [base+subpel_h_shuf4]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m5, 10
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ;process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ mova m6, [base+subpel_h_shuf4+16]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ lea srcq, [srcq+ssq*2]
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m4, m5, 10
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4 ; d -> w
+ packuswb m5, m5 ; w -> b
+ pshuflw m5, m5, q3120
+ movd [dstq+dsq*0], m5
+ psrlq m5, 32
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+.hv_w8:
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+%macro SAVELINE_W8 2
+ mova [rsp+hv8_line_%1*mmsize], %2
+%endmacro
+%macro RESTORELINE_W8 2
+ mova %2, [rsp+hv8_line_%1*mmsize]
+%endmacro
+ shr mxd, 16
+ sub srcq, 3
+%if ARCH_X86_32
+ %define base_reg r1
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ mov ssq, ssmp
+ ALLOC_STACK -mmsize*13
+%if STACK_ALIGNMENT < 16
+ %define dstm [rsp+mmsize*13+gprsize*1]
+ %define dsm [rsp+mmsize*13+gprsize*2]
+ mov r6, [rstk+stack_offset+gprsize*2]
+ mov dsm, r6
+%endif
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ punpcklbw m5, m5
+ psraw m5, 8 ; sign-extend
+ pshufd m2, m5, q0000
+ pshufd m3, m5, q1111
+ pshufd m4, m5, q2222
+ pshufd m5, m5, q3333
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ lea r6, [ssq*3]
+ mov dstm, dstq
+ sub srcq, r6
+%else
+ ALLOC_STACK 16*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
+ pshufd subpelh0, m0, q0000
+ pshufd subpelh1, m0, q1111
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ pshufd subpelv0, m1, q0000
+ pshufd subpelv1, m1, q1111
+ pshufd subpelv2, m1, q2222
+ pshufd subpelv3, m1, q3333
+ lea ss3q, [ssq*3]
+ mov r7, dstq
+ sub srcq, ss3q
+%endif
+ shl wd, 14
+ lea r6d, [hq+wq-(1<<16)]
+ mov r4, srcq
+.hv_w8_loop0:
+ movu m4, [srcq+ssq*0] ; 0 = _ _
+ movu m5, [srcq+ssq*1] ; 1 = _ _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+%endif
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ %if ARCH_X86_32
+ pshufb %3, %1, [base+subpel_h_shufB]
+ pshufb %4, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %3, %1, %6 ; subpel_h_shufB
+ pshufb %4, %1, %7 ; subpel_h_shufC
+ pshufb %1, %5 ; subpel_h_shufA
+ %endif
+ pmaddubsw %2, %3, subpelh0 ; subpel +0 C0
+ pmaddubsw %4, subpelh1; subpel +4 B4
+ pmaddubsw %3, subpelh1; C4
+ pmaddubsw %1, subpelh0; A0
+ paddw %2, %4 ; C0+B4
+ paddw %1, %3 ; A0+C4
+ phaddw %1, %2
+%endmacro
+%if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+%if ARCH_X86_32
+ movu m6, [srcq+ssq*0] ; 2 = _ _
+ movu m0, [srcq+ssq*1] ; 3 = _ _
+ lea srcq, [srcq+ssq*2]
+%else
+ movu m6, [srcq+ssq*2] ; 2 = _ _
+ add srcq, ss3q
+ movu m0, [srcq+ssq*0] ; 3 = _ _
+%endif
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m4, m7 ; H pw_8192
+ pmulhrsw m5, m7 ; H pw_8192
+ pmulhrsw m6, m7 ; H pw_8192
+ pmulhrsw m0, m7 ; H pw_8192
+ punpcklwd m1, m4, m5 ; 0 1 ~
+ punpcklwd m2, m5, m6 ; 1 2 ~
+ punpcklwd m3, m6, m0 ; 2 3 ~
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+ mova m7, [base+subpel_h_shufA]
+%if ARCH_X86_32
+ movu m4, [srcq+ssq*0] ; 4 = _ _
+ movu m5, [srcq+ssq*1] ; 5 = _ _
+ lea srcq, [srcq+ssq*2]
+%else
+ movu m4, [srcq+ssq*1] ; 4 = _ _
+ movu m5, [srcq+ssq*2] ; 5 = _ _
+ add srcq, ss3q
+%endif
+ movu m6, [srcq+ssq*0] ; 6 = _ _
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
+ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
+ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
+ punpcklwd m4, m0, m1 ; 3 4 ~
+ punpcklwd m5, m1, m2 ; 4 5 ~
+ punpcklwd m6, m2, m3 ; 5 6 ~
+ SAVELINE_W8 6, m3
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop:
+ ; m8 accu for V a
+ ; m9 accu for V b
+ SAVELINE_W8 1, m3
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_512]
+ paddd m0, m5 ; pd_512
+ paddd m7, m5 ; pd_512
+ mova accuv0, m0
+ mova accuv1, m7
+%else
+ pmaddwd m8, m1, subpelv0 ; a0
+ pmaddwd m9, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ mova m7, [base+pd_512]
+ paddd m8, m7 ; pd_512
+ paddd m9, m7 ; pd_512
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+%endif
+ movu m0, [srcq+ssq*1] ; 7
+ movu m4, [srcq+ssq*2] ; 8
+ lea srcq, [srcq+ssq*2]
+ HV_H_W8 m0, m1, m2, m3, m5, m7, m6
+ HV_H_W8 m4, m1, m2, m3, m5, m7, m6
+ mova m5, [base+pw_8192]
+ pmulhrsw m0, m5 ; H pw_8192
+ pmulhrsw m4, m5 ; H pw_8192
+ RESTORELINE_W8 6, m6
+ punpcklwd m5, m6, m0 ; 6 7 ~
+ punpcklwd m6, m0, m4 ; 7 8 ~
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1 ; H + V
+ psrad m2, 10
+ psrad m1, 10
+ packssdw m2, m1 ; d -> w
+ packuswb m2, m1 ; w -> b
+ movd [dstq+dsq*0], m2
+ psrlq m2, 32
+%if ARCH_X86_32
+ add dstq, dsm
+ movd [dstq+dsq*0], m2
+ add dstq, dsm
+%else
+ movd [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
+%else
+ add r4, 4
+ add r7, 4
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%endif
+ sub r6d, 1<<16
+ jg .hv_w8_loop0
+ RET
+
+%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ %if %5 == 1
+ pcmpeqd %2, %2
+ psrlq %2, 32
+ %endif
+ psrldq %3, %1, 1
+ pshufd %3, %3, q2301
+ pand %1, %2
+ pandn %4, %2, %3
+ por %1, %4
+ %endif
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %ifnidn %1, %2
+ mova %1, %2
+ %endif
+ PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %if notcpuflag(ssse3)
+ psrlq %1, %2, 16
+ %elifnidn %1, %2
+ mova %1, %2
+ %endif
+ PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+
+%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]
+ %if cpuflag(ssse3)
+ palignr %1, %2, %3, %4
+ %else
+ %if %0 == 4
+ %assign %%i regnumof%+%1 + 1
+ %define %%tmp m %+ %%i
+ %else
+ %define %%tmp %5
+ %endif
+ psrldq %1, %3, %4
+ pslldq %%tmp, %2, 16-%4
+ por %1, %%tmp
+ %endif
+%endmacro
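The PALIGNR fallback reproduces the SSSE3 instruction from SSE2 shifts: palignr concatenates the two 16-byte inputs (first source on top) and returns the 16 bytes starting `shift` bytes into the low source, which is exactly (src2 >> 8*shift) | (src1 << 8*(16-shift)). A byte-level sketch with illustrative names:

    #include <stdint.h>
    #include <string.h>

    /* Byte-level model of PALIGNR dst, src1, src2, shift: take the 32-byte
     * concatenation src1:src2 (src1 in the upper half) and keep the 16 bytes
     * starting `shift` bytes in -- the same value the psrldq/pslldq/por
     * sequence assembles when SSSE3 is unavailable. */
    static void palignr_model(uint8_t dst[16], const uint8_t src1[16],
                              const uint8_t src2[16], int shift)
    {
        uint8_t cat[32];
        memcpy(cat,      src2, 16); /* low half  */
        memcpy(cat + 16, src1, 16); /* high half */
        memcpy(dst, cat + shift, 16);
    }
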
+
+%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
+ %if cpuflag(ssse3)
+ phaddw %1, %2
+ %elifnidn %1, %2
+ %if %4 == 1
+ mova %3, [base+pw_1]
+ %endif
+ pmaddwd %1, %3
+ pmaddwd %2, %3
+ packssdw %1, %2
+ %else
+ %if %4 == 1
+ pmaddwd %1, [base+pw_1]
+ %else
+ pmaddwd %1, %3
+ %endif
+ packssdw %1, %1
+ %endif
+%endmacro
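The PHADDW fallback leans on the fact that pmaddwd with a vector of ones is a pairwise add: each pair of adjacent 16-bit lanes is multiplied by (1,1) and summed into a 32-bit lane, and packssdw narrows the sums back to words with saturation, matching what phaddw produces for one source operand. In scalar form, with illustrative names:

    #include <stdint.h>

    /* Scalar model of the non-SSSE3 PHADDW path for one operand:
     * pmaddwd by pw_1 sums adjacent word pairs into dwords,
     * packssdw saturates them back to int16. */
    static void phaddw_fallback_model(int16_t out[4], const int16_t in[8])
    {
        for (int i = 0; i < 4; i++) {
            int32_t sum = (int32_t)in[2 * i] + in[2 * i + 1];
            out[i] = sum > 32767 ? 32767 : sum < -32768 ? -32768 : (int16_t)sum;
        }
    }
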
+
+%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+ pmulhrsw %1, %2, %3
+ %else
+ paddw %1, %2, %3
+ psraw %1, %4
+ %endif
+%endmacro
+
+%macro PMULHRSW_8192 3 ; dst, src1, src2
+ PMULHRSW_POW2 %1, %2, %3, 2
+%endmacro
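PMULHRSW_8192 works because pmulhrsw against pw_8192 is a rounded arithmetic shift right by two: ((x*8192*2) + 0x8000) >> 16 equals (x + 2) >> 2 for every int16 x, which is precisely what the paddw pw_2 / psraw 2 pair in the SSE2 path computes. A small exhaustive check of that identity:

    #include <assert.h>
    #include <stdint.h>

    /* pmulhrsw semantics for a single lane. */
    static int16_t pmulhrsw_lane(int16_t a, int16_t b)
    {
        return (int16_t)((((int32_t)a * b * 2) + 0x8000) >> 16);
    }

    int main(void)
    {
        /* pmulhrsw x, 8192 == (x + 2) >> 2, i.e. the SSE2 paddw/psraw pair. */
        for (int x = -32768; x <= 32767; x++)
            assert(pmulhrsw_lane((int16_t)x, 8192) == ((x + 2) >> 2));
        return 0;
    }
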
+
+%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-3]
+ movd %1, [%2+0]
+ movd %3, [%2+1]
+ movd %4, [%2+2]
+ movd %5, [%2+3]
+ punpckldq %1, %3
+ punpckldq %4, %5
+ punpcklqdq %1, %4
+%endmacro
+
+%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc
+ %if cpuflag(ssse3)
+ movu m%1, [%2]
+ pshufb m2, m%1, m11 ; subpel_h_shufB
+ pshufb m3, m%1, m9 ; subpel_h_shufC
+ pshufb m%1, m10 ; subpel_h_shufA
+ %else
+ %if ARCH_X86_64
+ SWAP m12, m5
+ SWAP m13, m6
+ SWAP m14, m7
+ %define %%mx0 m%+%%i
+ %define %%mx1 m%+%%j
+ %assign %%i 0
+ %rep 12
+ movd %%mx0, [%2+%%i]
+ %assign %%i %%i+1
+ %endrep
+ %assign %%i 0
+ %rep 6
+ %assign %%j %%i+1
+ punpckldq %%mx0, %%mx1
+ %assign %%i %%i+2
+ %endrep
+ %assign %%i 0
+ %rep 3
+ %assign %%j %%i+2
+ punpcklqdq %%mx0, %%mx1
+ %assign %%i %%i+4
+ %endrep
+ SWAP m%1, m0
+ SWAP m2, m4
+ SWAP m3, m8
+ SWAP m5, m12
+ SWAP m6, m13
+ SWAP m7, m14
+ %else
+ PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7
+ PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7
+ PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7
+ SWAP m%1, m0
+ %endif
+ %endif
+%endmacro
+
+%macro PREP_8TAP_H 2 ; dst, src_memloc
+ PREP_8TAP_H_LOAD %1, %2
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+ SWAP m8, m1
+ SWAP m9, m7
+ %endif
+ %xdefine mX m%+%1
+ %assign %%i regnumof%+mX
+ %define mX m%+%%i
+ mova m4, m2
+ PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0
+ PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4
+ PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4
+ PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0
+ %undef mX
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+ SWAP m1, m8
+ SWAP m7, m9
+ %endif
+ paddw m3, m4
+ paddw m%1, m2
+ PHADDW m%1, m3, m15, ARCH_X86_32
+ %if ARCH_X86_64 || cpuflag(ssse3)
+ PMULHRSW_8192 m%1, m%1, m7
+ %else
+ PMULHRSW_8192 m%1, m%1, [base+pw_2]
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]
+ %if cpuflag(ssse3)
+ movu %1, [%2]
+ pshufb m2, %1, shufB
+ pshufb m3, %1, shufC
+ pshufb %1, shufA
+ %else
+ PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4
+ PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4
+ PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4
+ %endif
+ mova m1, m2
+ PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0
+ PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4
+ PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4
+ PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0
+ paddw m1, m3 ; C0+B4
+ paddw %1, m2 ; A0+C4
+ PHADDW %1, m1, %3, 1
+%endmacro
+
+%macro PREP_8TAP 0
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1, 2
+%elif WIN64
+ DECLARE_REG_TMP 6, 4
+%else
+ DECLARE_REG_TMP 6, 7
+%endif
+
+FN prep_8tap, sharp, SHARP, SHARP
+FN prep_8tap, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap, smooth, SMOOTH, SMOOTH
+FN prep_8tap, sharp_regular, SHARP, REGULAR
+FN prep_8tap, regular_sharp, REGULAR, SHARP
+FN prep_8tap, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap, regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+ %define base_reg r2
+ %define base base_reg-prep%+SUFFIX
+%else
+ %define base_reg r7
+ %define base 0
+%endif
+cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ mov wd, wm
+ movifnidn srcd, srcm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ LEA base_reg, prep_ssse3
+ tzcnt wd, wd
+ movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
+ pxor m4, m4
+ add wq, base_reg
+ movifnidn strided, stridem
+ lea r6, [strideq*3]
+ %assign stack_offset org_stack_offset
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ jmp wq
+.h:
+ LEA base_reg, prep%+SUFFIX
+ test myd, 0xf00
+ jnz .hv
+%if cpuflag(ssse3)
+ WIN64_SPILL_XMM 12
+%else
+ WIN64_SPILL_XMM 16
+%endif
+%if ARCH_X86_32
+ %define strideq r6
+ mov strideq, stridem
+%endif
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+ %else
+ %define m10 [base+subpel_h_shufA]
+ %define m11 [base+subpel_h_shufB]
+ %define m9 [base+subpel_h_shufC]
+ %endif
+%endif
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
+ movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
+ mova m7, [base+pw_8192]
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
+%else
+ punpcklbw m6, m6
+ psraw m6, 8
+ %if ARCH_X86_64
+ mova m7, [pw_2]
+ mova m15, [pw_1]
+ %else
+ %define m15 m4
+ %endif
+ pshufd m5, m6, q1010
+ punpckhqdq m6, m6
+%endif
+ add wq, base_reg
+ jmp wq
+.h_w4:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
+%if cpuflag(ssse3)
+ mova m6, [base+pw_8192]
+ mova m5, [base+subpel_h_shufA]
+ pshufd m4, m4, q0000
+%else
+ mova m6, [base+pw_2]
+ %if ARCH_X86_64
+ mova m14, [pw_1]
+ %else
+ %define m14 m7
+ %endif
+ punpcklbw m4, m4
+ psraw m4, 8
+ punpcklqdq m4, m4
+%endif
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+%endif
+.h_w4_loop:
+%if cpuflag(ssse3)
+ movq m0, [srcq+strideq*0] ; 0
+ movq m1, [srcq+strideq*1] ; 1
+ %if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m2, [srcq+strideq*0] ; 2
+ movq m3, [srcq+strideq*1] ; 3
+ lea srcq, [srcq+strideq*2]
+ %else
+ movq m2, [srcq+strideq*2] ; 2
+ movq m3, [srcq+stride3q ] ; 3
+ lea srcq, [srcq+strideq*4]
+ %endif
+ pshufb m0, m5
+ pshufb m1, m5
+ pshufb m2, m5
+ pshufb m3, m5
+%elif ARCH_X86_64
+ movd m0, [srcq+strideq*0+0]
+ movd m12, [srcq+strideq*0+1]
+ movd m1, [srcq+strideq*1+0]
+ movd m5, [srcq+strideq*1+1]
+ movd m2, [srcq+strideq*2+0]
+ movd m13, [srcq+strideq*2+1]
+ movd m3, [srcq+stride3q +0]
+ movd m7, [srcq+stride3q +1]
+ punpckldq m0, m12
+ punpckldq m1, m5
+ punpckldq m2, m13
+ punpckldq m3, m7
+ movd m12, [srcq+strideq*0+2]
+ movd m8, [srcq+strideq*0+3]
+ movd m5, [srcq+strideq*1+2]
+ movd m9, [srcq+strideq*1+3]
+ movd m13, [srcq+strideq*2+2]
+ movd m10, [srcq+strideq*2+3]
+ movd m7, [srcq+stride3q +2]
+ movd m11, [srcq+stride3q +3]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m12, m8
+ punpckldq m5, m9
+ punpckldq m13, m10
+ punpckldq m7, m11
+ punpcklqdq m0, m12 ; 0
+ punpcklqdq m1, m5 ; 1
+ punpcklqdq m2, m13 ; 2
+ punpcklqdq m3, m7 ; 3
+%else
+ movd m0, [srcq+strideq*0+0]
+ movd m1, [srcq+strideq*0+1]
+ movd m2, [srcq+strideq*0+2]
+ movd m3, [srcq+strideq*0+3]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ punpcklqdq m0, m2 ; 0
+ movd m1, [srcq+strideq*1+0]
+ movd m2, [srcq+strideq*1+1]
+ movd m3, [srcq+strideq*1+2]
+ movd m7, [srcq+strideq*1+3]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m1, m2
+ punpckldq m3, m7
+ punpcklqdq m1, m3 ; 1
+ movd m2, [srcq+strideq*0+0]
+ movd m3, [srcq+strideq*0+1]
+ movd m7, [srcq+strideq*0+2]
+ movd m5, [srcq+strideq*0+3]
+ punpckldq m2, m3
+ punpckldq m7, m5
+ punpcklqdq m2, m7 ; 2
+ movd m3, [srcq+strideq*1+0]
+ movd m7, [srcq+strideq*1+1]
+ punpckldq m3, m7
+ movd m7, [srcq+strideq*1+2]
+ movd m5, [srcq+strideq*1+3]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m7, m5
+ punpcklqdq m3, m7 ; 3
+%endif
+ PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
+ PMADDUBSW m1, m4, m5, m7, 0
+ PMADDUBSW m2, m4, m5, m7, 0
+ PMADDUBSW m3, m4, m5, m7, 0
+ PHADDW m0, m1, m14, ARCH_X86_32
+ PHADDW m2, m3, m14, 0
+ PMULHRSW_8192 m0, m0, m6
+ PMULHRSW_8192 m2, m2, m6
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+%if cpuflag(ssse3)
+ PREP_8TAP_H 0, srcq+strideq*0
+ PREP_8TAP_H 1, srcq+strideq*1
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ lea srcq, [srcq+strideq*2]
+ add tmpq, 32
+ sub hd, 2
+%else
+ PREP_8TAP_H 0, srcq
+ mova [tmpq], m0
+ add srcq, strideq
+ add tmpq, 16
+ dec hd
+%endif
+ jg .h_w8
+ RET
+.h_w16:
+ mov r3, -16*1
+ jmp .h_start
+.h_w32:
+ mov r3, -16*2
+ jmp .h_start
+.h_w64:
+ mov r3, -16*4
+ jmp .h_start
+.h_w128:
+ mov r3, -16*8
+.h_start:
+ sub srcq, r3
+ mov r5, r3
+.h_loop:
+%if cpuflag(ssse3)
+ PREP_8TAP_H 0, srcq+r3+8*0
+ PREP_8TAP_H 1, srcq+r3+8*1
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 32
+ add r3, 16
+%else
+ PREP_8TAP_H 0, srcq+r3
+ mova [tmpq], m0
+ add tmpq, 16
+ add r3, 8
+%endif
+ jl .h_loop
+ add srcq, strideq
+ mov r3, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ LEA base_reg, prep%+SUFFIX
+%if ARCH_X86_32
+ mov mxd, myd
+ and mxd, 0x7f
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+%endif
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
+ mova m2, [base+pw_512]
+ mova m7, [base+pw_8192]
+ punpcklwd m0, m0
+%else
+ punpcklbw m0, m0
+ psraw m0, 8
+%endif
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
+ %if cpuflag(ssse3)
+ ALLOC_STACK -mmsize*4
+ %else
+ ALLOC_STACK -mmsize*5
+ %endif
+%assign regs_used 7
+ mov strideq, [rstk+stack_offset+gprsize*3]
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
+ lea r5, [strideq*3]
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
+ sub srcq, r5
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ lea stride3q, [strideq*3]
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ sub srcq, stride3q
+ cmp wd, 8
+ jns .v_w8
+%endif
+.v_w4:
+%if notcpuflag(ssse3)
+ pxor m6, m6
+ %if ARCH_X86_64
+ mova m7, [base+pw_2]
+ %endif
+%endif
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < mmsize
+ %define srcm [esp+stack_size+gprsize*1]
+ %define tmpm [esp+stack_size+gprsize*2]
+ %endif
+ mov tmpm, tmpq
+ mov srcm, srcq
+ lea r5d, [wq - 4] ; horizontal loop
+ shl r5d, (16 - 2) ; (wq / 4) << 16
+ mov r5w, hw
+.v_w4_loop0:
+%endif
+ movd m1, [srcq+strideq*0]
+ movd m0, [srcq+strideq*1]
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movd m2, [srcq+strideq*0]
+ movd m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movd m3, [srcq+strideq*0]
+ movd m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+%else
+ movd m2, [srcq+strideq*2]
+ add srcq, stride3q
+ movd m4, [srcq+strideq*0]
+ movd m3, [srcq+strideq*1]
+ movd m5, [srcq+strideq*2]
+ add srcq, stride3q
+%endif
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+strideq*0]
+ punpckldq m2, m4 ; 2 3
+ punpckldq m4, m3 ; 3 4
+ punpckldq m3, m5 ; 4 5
+ punpckldq m5, m0 ; 5 6
+ punpcklbw m2, m4 ; 23 34
+ punpcklbw m3, m5 ; 45 56
+.v_w4_loop:
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel0
+ %define subpel0 m7
+%endif
+ mova m5, m1
+ PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel1
+ %define subpel1 m7
+%endif
+ mova m1, m2
+ PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1
+ paddw m5, m2
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel2
+ %define subpel2 m7
+%endif
+ mova m2, m3
+ PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2
+ movd m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ paddw m5, m3
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+strideq*0]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m12, m0
+ %else
+ mova [esp+mmsize*4], m0
+ mova m7, subpel3
+ %define subpel3 m7
+ %endif
+%endif
+ mova m4, m3
+ PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3
+ paddw m5, m4
+%if ARCH_X86_64 || cpuflag(ssse3)
+ %if notcpuflag(ssse3)
+ SWAP m0, m12
+ %endif
+ PMULHRSW_8192 m5, m5, m7
+%else
+ mova m0, [esp+mmsize*4]
+ PMULHRSW_8192 m5, m5, [base+pw_2]
+%endif
+ movq [tmpq+wq*0], m5
+ movhps [tmpq+wq*2], m5
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w4_loop
+%if ARCH_X86_32
+ mov srcq, srcm
+ mov tmpq, tmpm
+ movzx hd, r5w
+ add srcq, 4
+ add tmpq, 8
+ mov srcm, srcq
+ mov tmpm, tmpq
+ sub r5d, 1<<16 ; horizontal--
+ jg .v_w4_loop0
+%endif
+ RET
+%if ARCH_X86_64
+.v_w8:
+ lea r6d, [wq*8-64]
+ mov r5, srcq
+ mov r8, tmpq
+ lea r6d, [hq+r6*4]
+.v_w8_loop0:
+ movq m1, [srcq+strideq*0]
+ movq m2, [srcq+strideq*1]
+ movq m3, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m4, [srcq+strideq*0]
+ movq m5, [srcq+strideq*1]
+ movq m6, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m0, [srcq+strideq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
+.v_w8_loop:
+ movq m13, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+%if cpuflag(ssse3)
+ pmaddubsw m14, m1, subpel0 ; a0
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ pmaddubsw m4, subpel1 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, subpel2 ; a2
+ pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
+ paddw m15, m6
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+%else
+ mova m14, m1
+ PMADDUBSW m14, subpel0, m7, m12, 1 ; a0
+ mova m15, m2
+ PMADDUBSW m15, subpel0, m7, m12, 0 ; b0
+ mova m1, m3
+ PMADDUBSW m3, subpel1, m7, m12, 0 ; a1
+ mova m2, m4
+ PMADDUBSW m4, subpel1, m7, m12, 0 ; b1
+ paddw m14, m3
+ mova m3, m5
+ PMADDUBSW m5, subpel2, m7, m12, 0 ; a2
+ paddw m15, m4
+ mova m4, m6
+ PMADDUBSW m6, subpel2, m7, m12, 0 ; b2
+ paddw m15, m6
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ PMADDUBSW m12, subpel3, m7, m6, 0 ; a3
+ paddw m14, m12
+ mova m6, m13
+ PMADDUBSW m13, subpel3, m7, m12, 0 ; b3
+ paddw m15, m13
+ PMULHRSW_8192 m14, m14, [base+pw_2]
+ PMULHRSW_8192 m15, m15, [base+pw_2]
+%endif
+ movu [tmpq+wq*0], m14
+ movu [tmpq+wq*2], m15
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w8_loop
+ add r5, 8
+ add r8, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r8
+ sub r6d, 1<<8
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+.hv:
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8
+ and mxd, 0x7f
+ movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
+%if ARCH_X86_32
+ mov mxd, myd
+ shr myd, 16
+ and mxd, 0x7f
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ mov strideq, stridem
+ %assign regs_used 6
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ lea r5, [strideq*3+1]
+ sub srcq, r5
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklbw m0, m0
+ psraw m0, 8
+ pshufd m6, m0, q0000
+ mova subpelv0, m6
+ pshufd m6, m0, q1111
+ mova subpelv1, m6
+ pshufd m6, m0, q2222
+ mova subpelv2, m6
+ pshufd m6, m0, q3333
+ mova subpelv3, m6
+%else
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
+ ALLOC_STACK mmsize*14, 14
+ %else
+ ALLOC_STACK mmsize*14, 16
+ %endif
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ dec srcq
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklbw m0, m0
+ psraw m0, 8
+ %if cpuflag(ssse3)
+ mova m8, [base+pw_8192]
+ %else
+ mova m8, [base+pw_2]
+ %endif
+ mova m9, [base+pd_32]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ pshufd m7, m1, q0000
+%if notcpuflag(ssse3)
+ punpcklbw m7, m7
+ psraw m7, 8
+%endif
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+%if ARCH_X86_32
+ %if cpuflag(ssse3)
+ %define w8192reg [base+pw_8192]
+ %else
+ %define w8192reg [base+pw_2]
+ %endif
+ %define d32reg [base+pd_32]
+%else
+ %define w8192reg m8
+ %define d32reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4]
+%else
+ %if ARCH_X86_64
+ mova m15, [pw_1]
+ %else
+ %define m15 m1
+ %endif
+%endif
+ movq m5, [srcq+strideq*0] ; 0 _ _ _
+ movhps m5, [srcq+strideq*1] ; 0 _ 1 _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m4, [srcq+strideq*0] ; 2 _ _ _
+ movhps m4, [srcq+strideq*1] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*2]
+%else
+ movq m4, [srcq+strideq*2] ; 2 _ _ _
+ movhps m4, [srcq+stride3q ] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*4]
+%endif
+ PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~
+ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
+ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
+ PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+ PMULHRSW_8192 m2, m2, w8192reg
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4+16]
+%endif
+ PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~
+ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
+ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
+ PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+ PMULHRSW_8192 m2, m2, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m2
+ %else
+ mova [esp+mmsize*4], m2
+ %endif
+%endif
+ ; lower shuffle
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4]
+%endif
+ movq m5, [srcq+strideq*0] ; 4 _ _ _
+ movhps m5, [srcq+strideq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m4, [srcq+strideq*0] ; 6 _ _ _
+ add srcq, strideq
+%else
+ movq m4, [srcq+strideq*2] ; 6 _ _ _
+ add srcq, stride3q
+%endif
+ PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
+ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
+ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
+ PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+ PMULHRSW_8192 m3, m3, w8192reg
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4+16]
+%endif
+ PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
+ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
+ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
+ PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+ PMULHRSW_8192 m3, m3, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m2, m14
+ %else
+ mova m2, [esp+mmsize*4]
+ %endif
+%endif
+ ;process high
+ PALIGNR m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ;process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ PALIGNR m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop:
+ ;process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m5
+ %else
+ mova [esp+mmsize*4], m5
+ %define m15 m3
+ %endif
+%endif
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4]
+%endif
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
+ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
+ PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
+ PMULHRSW_8192 m4, m4, w8192reg
+ PALIGNR m3, m4, m0, 12, m5 ; 6787
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m5, m14
+ %else
+ mova m5, [esp+mmsize*4]
+ %endif
+%endif
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m5, 6
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ;process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m5
+ %else
+ mova [esp+0xA0], m5
+ %endif
+%endif
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4+16]
+%endif
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
+ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
+ PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
+ PMULHRSW_8192 m4, m4, w8192reg
+ PALIGNR m3, m4, m0, 12, m5 ; 6787
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m5, m14
+ %else
+ mova m5, [esp+0xA0]
+ %endif
+%endif
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m4, m5, 6
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4
+ pshufd m5, m5, q3120
+ movu [tmpq], m5
+ lea srcq, [srcq+strideq*2]
+ add tmpq, 16
+ sub hd, 2
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+.hv_w8:
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+ shr mxd, 16
+%if ARCH_X86_32
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+ mov mxd, myd
+ shr myd, 16
+ and mxd, 0x7f
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ mov strideq, stridem
+ %assign regs_used 6
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ %if STACK_ALIGNMENT < mmsize
+ %define tmpm [rsp+mmsize*13+gprsize*1]
+ %define srcm [rsp+mmsize*13+gprsize*2]
+ %define stridem [rsp+mmsize*13+gprsize*3]
+ mov tmpm, tmpq
+ mov stridem, strideq
+ %endif
+ %if cpuflag(ssse3)
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ %else
+ punpcklbw m1, m1
+ psraw m1, 8
+ pshufd m0, m1, q1010
+ punpckhqdq m1, m1
+ %endif
+ punpcklbw m5, m5
+ psraw m5, 8
+ pshufd m2, m5, q0000
+ pshufd m3, m5, q1111
+ pshufd m4, m5, q2222
+ pshufd m5, m5, q3333
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ lea r5, [strideq*3+3]
+ sub srcq, r5
+ mov srcm, srcq
+%else
+ ALLOC_STACK mmsize*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
+ pshufd subpelh0, m0, q0000
+ pshufd subpelh1, m0, q1111
+ %else
+ punpcklbw m0, m0
+ psraw m0, 8
+ pshufd subpelh0, m0, q1010
+ pshufd subpelh1, m0, q3232
+ mova m7, [base+pw_2]
+ %endif
+ punpcklbw m1, m1
+ psraw m1, 8
+ pshufd subpelv0, m1, q0000
+ pshufd subpelv1, m1, q1111
+ pshufd subpelv2, m1, q2222
+ pshufd subpelv3, m1, q3333
+ lea stride3q, [strideq*3]
+ sub srcq, 3
+ sub srcq, stride3q
+ mov r6, srcq
+ mov r8, tmpq
+%endif
+ lea r5d, [wq-4]
+ shl r5d, 14
+ add r5d, hd
+.hv_w8_loop0:
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+ %define shufA m7
+ %define shufB m8
+ %define shufC m9
+ %else
+ %define shufA [base+subpel_h_shufA]
+ %define shufB [base+subpel_h_shufB]
+ %define shufC [base+subpel_h_shufC]
+ %endif
+%endif
+ PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
+ PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
+%if ARCH_X86_64
+ PREP_8TAP_HV m6, srcq+strideq*2, m7, m0
+ add srcq, stride3q
+ PREP_8TAP_HV m0, srcq+strideq*0, m7, m9
+%else
+ lea srcq, [srcq+strideq*2]
+ %if notcpuflag(ssse3)
+ mova [esp], m4
+ %endif
+ PREP_8TAP_HV m6, srcq+strideq*0, m7, m4
+ PREP_8TAP_HV m0, srcq+strideq*1, m7, m4
+ lea srcq, [srcq+strideq*2]
+%endif
+%if cpuflag(ssse3)
+ mova m7, [base+pw_8192]
+%else
+ mova m7, [base+pw_2]
+ %if ARCH_X86_32
+ mova m4, [esp]
+ %endif
+%endif
+ PMULHRSW_8192 m4, m4, m7
+ PMULHRSW_8192 m5, m5, m7
+ PMULHRSW_8192 m6, m6, m7
+ PMULHRSW_8192 m0, m0, m7
+ punpcklwd m1, m4, m5 ; 01
+ punpcklwd m2, m5, m6 ; 12
+ punpcklwd m3, m6, m0 ; 23
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+%if cpuflag(ssse3)
+ mova m7, [base+subpel_h_shufA]
+%endif
+%if ARCH_X86_64
+ PREP_8TAP_HV m4, srcq+strideq*1, m8, m9
+ PREP_8TAP_HV m5, srcq+strideq*2, m8, m9
+ add srcq, stride3q
+ PREP_8TAP_HV m6, srcq+strideq*0, m8, m9
+%else
+ %if notcpuflag(ssse3)
+ mova [esp+0x30], m0
+ %endif
+ PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
+ PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m6, srcq+strideq*0, m7, m0
+%endif
+%if cpuflag(ssse3)
+ mova m7, [base+pw_8192]
+%elif ARCH_X86_32
+ mova m0, [esp+0x30]
+ mova m7, [base+pw_2]
+%endif
+ PMULHRSW_8192 m1, m4, m7
+ PMULHRSW_8192 m2, m5, m7
+ PMULHRSW_8192 m3, m6, m7
+ punpcklwd m4, m0, m1 ; 34
+ punpcklwd m5, m1, m2 ; 45
+ punpcklwd m6, m2, m3 ; 56
+ SAVELINE_W8 6, m3
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop:
+ SAVELINE_W8 1, m3
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_32]
+ paddd m0, m5
+ paddd m7, m5
+ mova accuv0, m0
+ mova accuv1, m7
+%else
+ pmaddwd accuv0, m1, subpelv0 ; a0
+ pmaddwd accuv1, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd accuv0, m3
+ paddd accuv1, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd accuv0, m5
+ paddd accuv1, m6
+ mova m7, [base+pd_32]
+ paddd accuv0, m7
+ paddd accuv1, m7
+ %if cpuflag(ssse3)
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+ %define shufA m5
+ %define shufB m7
+ %define shufC m6
+ %endif
+%endif
+ PREP_8TAP_HV m0, srcq+strideq*1, m5, m6
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m4, srcq+strideq*0, m5, m6
+%if cpuflag(ssse3)
+ mova m5, [base+pw_8192]
+%else
+ mova m5, [base+pw_2]
+%endif
+ PMULHRSW_8192 m0, m0, m5
+ PMULHRSW_8192 m4, m4, m5
+ RESTORELINE_W8 6, m6
+ punpcklwd m5, m6, m0 ; 67
+ punpcklwd m6, m0, m4 ; 78
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1
+ psrad m2, 6
+ psrad m1, 6
+ packssdw m2, m1
+ movq [tmpq+wq*0], m2
+ movhps [tmpq+wq*2], m2
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+%if ARCH_X86_32
+ mov srcq, srcm
+ mov tmpq, tmpm
+ movzx hd, r5w
+ add srcq, 4
+ add tmpq, 8
+ mov srcm, srcq
+ mov tmpm, tmpq
+%else
+ add r6, 4
+ add r8, 8
+ movzx hd, r5b
+ mov srcq, r6
+ mov tmpq, r8
+%endif
+ sub r5d, 1<<16
+ jg .hv_w8_loop0
+ RET
+%endmacro
+
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro SAVE_REG 1
+ %xdefine r%1_save r%1
+ %xdefine r%1q_save r%1q
+ %xdefine r%1d_save r%1d
+ %if ARCH_X86_32
+ %define r%1m_save [rstk+stack_offset+(%1+1)*4]
+ %endif
+%endmacro
+
+%macro LOAD_REG 1
+ %xdefine r%1 r%1_save
+ %xdefine r%1q r%1q_save
+ %xdefine r%1d r%1d_save
+ %if ARCH_X86_32
+ %define r%1m r%1m_save
+ %endif
+ %undef r%1d_save
+ %undef r%1q_save
+ %undef r%1_save
+%endmacro
+
+%macro REMAP_REG 2-3
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+ %if ARCH_X86_32
+ %if %3 == 0
+ %xdefine r%1m r%2m
+ %else
+ %define r%1m [rstk+stack_offset+(%1+1)*4]
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %if ARCH_X86_64
+ SAVE_REG 14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %else
+ SAVE_REG 5
+ %assign %%i 5
+ %rep 5
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j, 0
+ %assign %%i %%i-1
+ %endrep
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %if ARCH_X86_64
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 14
+ %else
+ %rep 4
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j, 1
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 5
+ %endif
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
+ SWAP m%2, m%5
+ movq m%1, [srcq+ r4]
+ movq m%2, [srcq+ r6]
+ movhps m%1, [srcq+ r7]
+ movhps m%2, [srcq+ r9]
+ movq m%3, [srcq+r10]
+ movq m%4, [srcq+r11]
+ movhps m%3, [srcq+r13]
+ movhps m%4, [srcq+ rX]
+ add srcq, ssq
+ movq m%5, [srcq+ r4]
+ movq m%6, [srcq+ r6]
+ movhps m%5, [srcq+ r7]
+ movhps m%6, [srcq+ r9]
+ movq m%7, [srcq+r10]
+ movq m%8, [srcq+r11]
+ movhps m%7, [srcq+r13]
+ movhps m%8, [srcq+ rX]
+ add srcq, ssq
+ pmaddubsw m%1, m%9
+ pmaddubsw m%5, m%9
+ pmaddubsw m%2, m%10
+ pmaddubsw m%6, m%10
+ pmaddubsw m%3, m%11
+ pmaddubsw m%7, m%11
+ pmaddubsw m%4, m%12
+ pmaddubsw m%8, m%12
+ phaddw m%1, m%2
+ phaddw m%5, m%6
+ phaddw m%3, m%4
+ phaddw m%7, m%8
+ phaddw m%1, m%3
+ phaddw m%5, m%7
+ pmulhrsw m%1, m12
+ pmulhrsw m%5, m12
+ SWAP m%2, m%5
+ %endmacro
+%else
+ %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets
+ %if %3 == 1
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ %endif
+ movq m0, [srcq+r0]
+ movq m1, [srcq+rX]
+ movhps m0, [srcq+r4]
+ movhps m1, [srcq+r5]
+ add srcq, ssq
+ movq m4, [srcq+r0]
+ movq m5, [srcq+rX]
+ movhps m4, [srcq+r4]
+ movhps m5, [srcq+r5]
+ mov r0, [esp+16]
+ mov rX, [esp+24]
+ mov r4, [esp+20]
+ mov r5, [esp+28]
+ sub srcq, ssq
+ movq m2, [srcq+r0]
+ movq m3, [srcq+rX]
+ movhps m2, [srcq+r4]
+ movhps m3, [srcq+r5]
+ add srcq, ssq
+ movq m6, [srcq+r0]
+ movq m7, [srcq+rX]
+ movhps m6, [srcq+r4]
+ movhps m7, [srcq+r5]
+ add srcq, ssq
+ pmaddubsw m0, [esp+%1+ 0]
+ pmaddubsw m4, [esp+%1+ 0]
+ pmaddubsw m1, [esp+%1+16]
+ pmaddubsw m5, [esp+%1+16]
+ pmaddubsw m2, [esp+%1+32]
+ pmaddubsw m6, [esp+%1+32]
+ pmaddubsw m3, [esp+%1+48]
+ pmaddubsw m7, [esp+%1+48]
+ phaddw m0, m1
+ phaddw m4, m5
+ phaddw m2, m3
+ phaddw m6, m7
+ phaddw m0, m2
+ phaddw m4, m6
+ pmulhrsw m0, m12
+ pmulhrsw m4, m12
+ %if %2 != 0
+ mova [esp+%2+ 0], m0
+ mova [esp+%2+16], m4
+ %endif
+ %endmacro
+%endif
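
MC_8TAP_SCALED_H handles the scaled case, where every output column carries its own integer source offset and its own subpel filter (both derived elsewhere from mx + x*dx); it gathers eight bytes per column from those offsets, applies the per-column taps, and collapses the products with the phaddw cascade and a pmulhrsw against m12 (pw_8192, a rounded shift by two). A rough scalar sketch of that per-column structure; the parameter layout here is illustrative, not dav1d's actual one:

    #include <stdint.h>

    /* Rough sketch of one scaled-horizontal row: per-column source offset
     * and per-column 8-tap filter, with the result kept at the rounded >>2
     * intermediate precision implied by pmulhrsw with pw_8192.
     * Intermediate pmaddubsw/phaddw saturation is not modelled. */
    static void scaled_h_row_model(int16_t *dst, const uint8_t *src,
                                   const int32_t *src_off,
                                   const int8_t (*taps)[8], int w)
    {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int i = 0; i < 8; i++)
                sum += taps[x][i] * src[src_off[x] + i];
            dst[x] = (int16_t)((sum + 2) >> 2);
        }
    }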
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else ; prep
+ %assign isprep 1
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+0x138]
+ %endif
+ %xdefine base_reg r11
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %define tmp_stridem dword [esp+0x138]
+ %endif
+ %define rndshift 6
+%endif
+%if ARCH_X86_32
+ mov [esp+0x1f0], t0d
+ mov [esp+0x1f4], t1d
+ %if !isprep && required_stack_alignment > STACK_ALIGNMENT
+ mov dstd, dstm
+ mov dsd, dsm
+ mov srcd, srcm
+ mov ssd, ssm
+ mov hd, hm
+ mov r4, mxm
+ %define r0m [esp+0x200]
+ %define dsm [esp+0x204]
+ %define dsmp dsm
+ %define r1m dsm
+ %define r2m [esp+0x208]
+ %define ssm [esp+0x20c]
+ %define r3m ssm
+ %define hm [esp+0x210]
+ %define mxm [esp+0x214]
+ mov r0m, dstd
+ mov dsm, dsd
+ mov r2m, srcd
+ mov ssm, ssd
+ mov hm, hd
+ mov r0, mym
+ mov r1, dxm
+ mov r2, dym
+ %define mym [esp+0x218]
+ %define dxm [esp+0x09c]
+ %define dym [esp+0x21c]
+ mov mxm, r4
+ mov mym, r0
+ mov dxm, r1
+ mov dym, r2
+ tzcnt wd, wm
+ %endif
+ %if isprep && required_stack_alignment > STACK_ALIGNMENT
+ %xdefine base_reg r5
+ %else
+ %xdefine base_reg r6
+ %endif
+ mov ssd, ssm
+%endif
+ LEA base_reg, %1_8tap_scaled_8bpc_ssse3
+%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3
+%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
+ tzcnt wd, wm
+%endif
+%if ARCH_X86_32
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+%endif
+ movd m8, dxm
+ movd m14, mxm
+ pshufd m8, m8, q0000
+ pshufd m14, m14, q0000
+%if isprep && UNIX64
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%endif
+%if ARCH_X86_64
+ mov dyd, dym
+%endif
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %elif ARCH_X86_64
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %if ARCH_X86_64
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+0x138]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+ %else
+ %define rX r1
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %elif ARCH_X86_64
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+0x94]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %if ARCH_X86_64
+ %define rX r14
+ %define rXd r14d
+ %else
+ %define rX r3
+ %endif
+%endif
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m12, [base+pw_8192]
+ %ifidn %1, put
+ mova m13, [base+pd_512]
+ %else
+ mova m13, [base+pd_32]
+ %endif
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m12 [base+pw_8192]
+ %ifidn %1, put
+ %define m13 [base+pd_512]
+ %else
+ %define m13 [base+pd_32]
+ %endif
+%endif
+ pxor m9, m9
+%if ARCH_X86_64
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+%else
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ mov r1, [esp+0x1f4]
+ lea r0, [ssq*3]
+ movzx r2, r1b
+ shr r1, 16
+ cmp dword hm, 6
+ cmovs r1, r2
+ mov [esp+0x1f4], r1
+ mov r1, r1m
+ mov r2, r2m
+ sub srcq, r0
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define ss3q r0
+ %define myd r4
+ %define dyd dword dym
+ %define hd dword hm
+%endif
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ %else
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+ %endif
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+ %else
+ %define m11 [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [rsp+0x180], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m8 m5
+ %define m15 m6
+ %endif
+ movq m0, [srcq+ssq*0]
+ movq m2, [srcq+ssq*2]
+ movhps m0, [srcq+ssq*1]
+ movhps m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ %endif
+ movq m1, [srcq+ssq*0]
+ movq m3, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*1]
+ movhps m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ %if ARCH_X86_64
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ %else
+ pand m7, m8, m11
+ pandn m8, m15
+ %define m8 m6
+ %define m15 m5
+ por m15, m7
+ mova [rsp+0x190], m15
+ %endif
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 1 2 3
+ pmulhrsw m1, m12 ; 4 5 6 7
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mov myd, mym
+ mov r0, r0m
+ mova [rsp+0x1a0], m3
+ mova [rsp+0x1b0], m0
+ mova [rsp+0x1c0], m2
+ mova [rsp+0x1d0], m4
+ %endif
+.w2_loop:
+ and myd, 0x3ff
+ %if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m5, m3, m8
+ pmaddwd m6, m0, m9
+ pmaddwd m7, m2, m10
+ pmaddwd m8, m4, m11
+ paddd m5, m6
+ paddd m7, m8
+ %else
+ mov mym, myd
+ mov r1, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r1, [r1+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r1*8+0]
+ cmovnz r3, [base+subpel_filters+r1*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m5, m7, q0000
+ pshufd m6, m7, q1111
+ pmaddwd m3, m5
+ pmaddwd m0, m6
+ pshufd m5, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m2, m5
+ pmaddwd m4, m7
+ paddd m3, m0
+ paddd m2, m4
+ SWAP m5, m3
+ SWAP m7, m2
+ %endif
+ paddd m5, m13
+ paddd m5, m7
+ psrad m5, 10
+ packssdw m5, m5
+ packuswb m5, m5
+ %if ARCH_X86_64
+ pextrw r6d, m5, 0
+ mov [dstq], r6w
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ %else
+ pextrw r3d, m5, 0
+ mov [dstq], r3w
+ add dstq, dsm
+ dec hd
+ jz .ret
+ mov myd, mym
+ add myd, dym
+ %endif
+ test myd, ~0x3ff
+ %if ARCH_X86_32
+ SWAP m3, m5
+ SWAP m2, m7
+ mova m3, [rsp+0x1a0]
+ mova m0, [rsp+0x1b0]
+ mova m2, [rsp+0x1c0]
+ mova m4, [rsp+0x1d0]
+ %define m14 [esp+0x180]
+ %define m15 [esp+0x190]
+ %endif
+ jz .w2_loop
+ %if ARCH_X86_32
+ mov r3, r3m
+ %endif
+ movq m5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps m3, m0, q1032 ; 01 12
+ shufps m0, m2, q1032 ; 23 34
+ shufps m2, m4, q1032 ; 45 56
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ phaddw m5, m5
+ pmulhrsw m5, m12
+ palignr m4, m5, m1, 12
+ punpcklqdq m1, m4, m4 ; 6 7 6 7
+ punpcklwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [rsp+0x1a0], m3
+ mova [rsp+0x1b0], m0
+ mova [rsp+0x1c0], m2
+ mova [rsp+0x1d0], m4
+ %endif
+ jmp .w2_loop
+.w2_skip_line:
+ movhps m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m3, m0 ; 01 12
+ mova m0, m2 ; 23 34
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ phaddw m5, m5
+ pmulhrsw m5, m12 ; 6 7 6 7
+ palignr m4, m5, m1, 8 ; 4 5 6 7
+ pshufd m5, m4, q0321 ; 5 6 7 _
+ mova m1, m4
+ punpcklwd m2, m4, m5 ; 45 56
+ punpckhwd m4, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [rsp+0x1a0], m3
+ mova [rsp+0x1b0], m0
+ mova [rsp+0x1c0], m2
+ mova [rsp+0x1d0], m4
+ %endif
+ jmp .w2_loop
+%endif
+INIT_XMM ssse3
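+; 4-pixel-wide path: four per-pixel horizontal filters are selected from
+; mx+dx*[0-3]; the vertical filter is re-derived from my on every output row.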
+.w4:
+%if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+%else
+ %define m11 [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ psrldq m7, m15, 8
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd rX, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r4, m15
+ movd r5, m7
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ movifprep r3, r3m
+ SWAP m4, m7
+ %define m15 m1
+%endif
+ mova m5, [base+bdct_lb_dw]
+ movq m6, [base+subpel_s_shuf2]
+ psrld m14, 10
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m15, m2
+ punpcklqdq m6, m6
+ pshufb m14, m5
+ paddb m14, m6
+%if ARCH_X86_64
+ pcmpeqd m0, m9
+ pand m11, m0
+%else
+ mova [esp+0x180], m14
+ SWAP m7, m4
+ pxor m3, m3
+ pcmpeqd m0, m3
+ pand m2, m11, m0
+ %define m11 m2
+%endif
+ pandn m0, m15
+%if ARCH_X86_64
+ SWAP m15, m0
+%else
+ %define m15 m0
+%endif
+ por m15, m11
+%if ARCH_X86_64
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m2, [srcq+ssq*0]
+ movu m4, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pshufb m2, m14
+ pshufb m4, m14
+ pshufb m3, m14
+ pshufb m5, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ pmaddubsw m2, m15
+ pmaddubsw m4, m15
+ pmaddubsw m3, m15
+ pmaddubsw m5, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ phaddw m9, m2, m4
+ phaddw m3, m5
+ pmulhrsw m7, m12 ; 0 1
+ pmulhrsw m8, m12 ; 2 3
+ pmulhrsw m9, m12 ; 4 5
+ pmulhrsw m3, m12 ; 6 7
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ psrldq m11, m3, 8 ; 7 _
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ punpcklwd m3, m11 ; 67
+ mova [rsp+0x00], m7
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+%else
+ mova [esp+0x190], m15
+ lea ss3q, [ssq*3]
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ movu m7, [srcq+ssq*2]
+ movu m6, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m7, m14
+ pshufb m6, m14
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m7, m15
+ pmaddubsw m6, m15
+ phaddw m2, m3
+ phaddw m7, m6
+ movu m1, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m6, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m1, m14
+ pshufb m5, m14
+ pshufb m3, m14
+ pshufb m6, m14
+ pmaddubsw m1, m15
+ pmaddubsw m5, m15
+ pmaddubsw m3, m15
+ pmaddubsw m6, m15
+ phaddw m1, m5
+ phaddw m3, m6
+ pmulhrsw m2, m12
+ pmulhrsw m7, m12
+ pmulhrsw m1, m12
+ pmulhrsw m3, m12
+ shufps m4, m2, m7, q1032 ; 1 2
+ shufps m5, m7, m1, q1032 ; 3 4
+ shufps m6, m1, m3, q1032 ; 5 6
+ psrldq m0, m3, 8 ; 7 _
+ mova [esp+0x1a0], m0
+ %define m11 [esp+0x1a0]
+ punpcklwd m0, m2, m4 ; 01
+ punpckhwd m2, m4 ; 12
+ punpcklwd m4, m7, m5 ; 23
+ punpckhwd m7, m5 ; 34
+ punpcklwd m5, m1, m6 ; 45
+ punpckhwd m1, m6 ; 56
+ punpcklwd m3, [esp+0x1a0] ; 67
+ mov myd, mym
+ mov r0, r0m
+ mova [esp+0x1b0], m0 ; 01
+ mova [esp+0x1c0], m4 ; 23
+ mova [esp+0x1d0], m5 ; 45
+ mova [esp+0x1e0], m3 ; 67
+ mova [rsp+0x00], m2 ; 12
+ mova [rsp+0x10], m7 ; 34
+ mova [rsp+0x20], m1 ; 56
+ SWAP m1, m4
+ SWAP m2, m5
+%endif
+.w4_loop:
+ and myd, 0x3ff
+%if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m10, r6q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m10
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+%else
+ mov mym, myd
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ paddd m0, m1
+ paddd m2, m3
+ paddd m0, m13
+ paddd m0, m2
+ SWAP m4, m0
+%endif
+ psrad m4, rndshift
+ packssdw m4, m4
+%ifidn %1, put
+ packuswb m4, m4
+ movd [dstq], m4
+ add dstq, dsmp
+%else
+ movq [tmpq], m4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+%else
+ SWAP m0, m4
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ jnz .w4_next_line
+ mova m0, [esp+0x1b0]
+ mova m1, [esp+0x1c0]
+ mova m2, [esp+0x1d0]
+ mova m3, [esp+0x1e0]
+ jmp .w4_loop
+.w4_next_line:
+ %define m14 [esp+0x180]
+ %define m15 [esp+0x190]
+%endif
+ movu m4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+%if ARCH_X86_64
+ mova m0, [rsp+0x00]
+ mova [rsp+0x00], m1
+ mova m1, [rsp+0x10]
+ mova [rsp+0x10], m2
+ mova m2, [rsp+0x20]
+ mova [rsp+0x20], m3
+%else
+ mova m5, [esp+0x1c0]
+ mova m0, [rsp+0x000]
+ mova [rsp+0x00], m5
+ mova [esp+0x1b0], m0
+ mova m6, [esp+0x1d0]
+ mova m1, [rsp+0x010]
+ mova [rsp+0x10], m6
+ mova [esp+0x1c0], m1
+ mova m7, [esp+0x1e0]
+ mova m2, [rsp+0x020]
+ mova [rsp+0x20], m7
+ mova [esp+0x1d0], m2
+%endif
+ pshufb m4, m14
+ pmaddubsw m4, m15
+ phaddw m4, m4
+ pmulhrsw m4, m12
+ punpcklwd m3, m11, m4
+%if ARCH_X86_32
+ mova [esp+0x1e0], m3
+%endif
+ mova m11, m4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+%if ARCH_X86_32
+ mova m0, [esp+0x1c0]
+ mova m1, [esp+0x1d0]
+ mova m2, [esp+0x1e0]
+%endif
+ movu m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m6, [rsp+0x10]
+ mova m7, [rsp+0x20]
+ pshufb m4, m14
+ pshufb m5, m14
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m4, m5
+ pmulhrsw m4, m12
+ punpcklwd m5, m11, m4
+ mova [rsp+0x00], m6
+ mova [rsp+0x10], m7
+ mova [rsp+0x20], m5
+%if ARCH_X86_64
+ psrldq m11, m4, 8
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ punpcklwd m3, m4, m11
+%else
+ psrldq m6, m4, 8
+ punpcklwd m3, m4, m6
+ mova [esp+0x1a0], m6
+ mova [esp+0x1b0], m0
+ mova [esp+0x1c0], m1
+ mova [esp+0x1d0], m2
+ mova [esp+0x1e0], m3
+%endif
+ jmp .w4_loop
+INIT_XMM ssse3
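+; Widths 8..128 are processed as columns of 8 pixels: [rsp+0x90] counts the
+; remaining columns and tmp_stridem holds the prep output row stride.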
+.w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+%if ARCH_X86_64
+ shr t0d, 16
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r4, [esp+0x1f0]
+ shr r4, 16
+ movd m15, r4
+ mov r0, r0m
+ mov myd, mym
+%endif
+ sub srcq, 3
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ mov r5, hm
+ mov [esp+0x094], myd
+ mov [esp+0x134], r5
+%endif
+ jmp .hloop
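+; Column epilogue: step the dst/tmp base 8 pixels to the right, reload the
+; saved src pointer, height and mx/dx state, then run the next column.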
+.hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+%if ARCH_X86_64
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+%else
+ add dword [esp+0x130], 8*(isprep+1)
+ mov myd, [esp+0x094]
+ mov r5, [esp+0x134]
+ mov r0, [esp+0x130]
+%endif
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+%endif
+ mova m15, [rsp+0x120]
+ pxor m9, m9
+ mov srcq, [rsp+0x098]
+%if ARCH_X86_64
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+%else
+ mov mym, myd
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.hloop:
+%if ARCH_X86_64
+ mova m11, [base+pq_0x40000000]
+%else
+ %define m11 [base+pq_0x40000000]
+%endif
+ psrld m2, m14, 10
+ mova [rsp], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m2, m5, 8
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+ pxor m2, m2
+ %define m9 m2
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m11, m4
+ pand m8, m11, m6
+ pand m15, m11, m14
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m11, m5
+ mova [rsp+0x10], m7
+ mova [rsp+0x20], m8
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m9, [rsp+0x80]
+ mov myd, mym
+ mov dyd, dym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+ SWAP m14, m8
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m5, m11, q0000
+ pshufd m7, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m4, m5, m0
+ pmaddwd m5, m5, m1
+ pmaddwd m6, m7, m2
+ pmaddwd m7, m7, m3
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+ pmaddwd m8, [rsp+0x70], m11
+ pmaddwd m9, [rsp+0x80], m11
+ paddd m4, m6
+ paddd m5, m7
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r4, m15
+ movd r5, m4
+ mova m14, [esp+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [esp+16], m14
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m11, m4
+ pand m1, m11, m6
+ pand m2, m11, m7
+ pand m3, m11, m5
+ pandn m4, [esp+0x20]
+ pandn m6, [esp+0x30]
+ pandn m7, [esp+0x40]
+ pandn m5, [esp+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1
+ MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3
+ MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5
+ MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7
+ mova m5, [esp+0x180]
+ mova m6, [esp+0x190]
+ mova m7, [esp+0x1a0]
+ mova m0, [esp+0x1b0]
+ mov myd, mym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [esp+0x180], m4
+ mova [esp+0x190], m5
+ mova [esp+0x1a0], m6
+ mova [esp+0x1b0], m7
+ mova m1, [esp+0x140]
+ mova m2, [esp+0x150]
+ mova m3, [esp+0x160]
+ mova m4, [esp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [esp+0x140], m0
+ mova [esp+0x150], m1
+ mova [esp+0x160], m2
+ mova [esp+0x170], m3
+.vloop:
+ mov r0, r0m
+ mov r5, [esp+0x1f4]
+ and myd, 0x3ff
+ mov mym, myd
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [esp+0x180], m6
+ pmaddwd m3, [esp+0x190], m6
+ pmaddwd m4, [esp+0x1a0], m7
+ pmaddwd m5, [esp+0x1b0], m7
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m13
+ paddd m1, m13
+ paddd m4, m0
+ paddd m5, m1
+%endif
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+0x140], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ mova m14, [base+unpckw]
+ movq m6, [srcq+r10]
+ movq m7, [srcq+r11]
+ movhps m6, [srcq+r13]
+ movhps m7, [srcq+ rX]
+ movq m4, [srcq+ r4]
+ movq m5, [srcq+ r6]
+ movhps m4, [srcq+ r7]
+ movhps m5, [srcq+ r9]
+ add srcq, ssq
+ mov myd, [rsp+0x140]
+ mov dyd, dym
+ pshufd m9, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m9 ; 3a 2a
+ pshufb m3, m9 ; 3b 2b
+ pmaddubsw m6, [rsp+0x30]
+ pmaddubsw m7, [rsp+0x40]
+ pmaddubsw m4, [rsp+0x10]
+ pmaddubsw m5, [rsp+0x20]
+ phaddw m6, m7
+ phaddw m4, m5
+ phaddw m4, m6
+ pmulhrsw m4, m12
+ pshufb m5, [rsp+0x50], m14 ; 4a 5a
+ pshufb m6, [rsp+0x60], m14 ; 4b 5b
+ pshufb m7, [rsp+0x70], m9 ; 7a 6a
+ pshufb m8, [rsp+0x80], m9 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ punpckhwd m5, m7 ; 56a
+ punpckhwd m6, m8 ; 56b
+ punpcklwd m7, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m8, m4 ; 78b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m6
+ mova [rsp+0x70], m7
+ mova [rsp+0x80], m8
+ jmp .vloop
+.skip_line:
+ mova m0, [rsp+0x10]
+ mova m1, [rsp+0x20]
+ mova m14, [rsp+0x30]
+ mova m15, [rsp+0x40]
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
+ mov myd, [rsp+0x140]
+ mov dyd, dym
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [rsp+0x50] ; 23a
+ mova m3, [rsp+0x60] ; 23b
+ mova m5, [rsp+0x70] ; 45a
+ mova m6, [rsp+0x80] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m6
+ mova [rsp+0x70], m7
+ mova [rsp+0x80], m4
+%else
+ mov r0m, r0
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ mov mym, myd
+ jnz .next_line
+ mova m0, [esp+0x140]
+ mova m1, [esp+0x150]
+ mova m2, [esp+0x160]
+ mova m3, [esp+0x170]
+ jmp .vloop
+.next_line:
+ test myd, 0x400
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ jz .skip_line
+ mova m6, [base+unpckw]
+ mova m0, [esp+0x140]
+ mova m1, [esp+0x150]
+ mova m7, [esp+0x180]
+ movq m4, [srcq+r0]
+ movq m5, [srcq+rX]
+ movhps m4, [srcq+r4]
+ movhps m5, [srcq+r5]
+ pshufb m0, m6 ; 0a 1a
+ pshufb m1, m6 ; 0b 1b
+ pshufb m7, m6 ; 4a 5a
+ mov r0, [esp+16]
+ mov rX, [esp+24]
+ mov r4, [esp+20]
+ mov r5, [esp+28]
+ movq m3, [srcq+r0]
+ movq m2, [srcq+rX]
+ movhps m3, [srcq+r4]
+ movhps m2, [srcq+r5]
+ add srcq, ssq
+ pmaddubsw m4, [esp+0x20]
+ pmaddubsw m5, [esp+0x30]
+ pmaddubsw m3, [esp+0x40]
+ pmaddubsw m2, [esp+0x50]
+ phaddw m4, m5
+ phaddw m3, m2
+ mova m5, [esp+0x190]
+ mova m2, [esp+0x160]
+ phaddw m4, m3
+ mova m3, [esp+0x170]
+ pmulhrsw m4, m12 ; 8a 8b
+ mov myd, mym
+ pshufb m5, m6 ; 4b 5b
+ pshufd m6, m6, q1032
+ pshufb m2, m6 ; 3a 2a
+ pshufb m3, m6 ; 3b 2b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ mova [esp+0x140], m0
+ mova [esp+0x150], m1
+ mova m0, [esp+0x1a0]
+ mova m1, [esp+0x1b0]
+ punpcklwd m2, m7 ; 34a
+ punpcklwd m3, m5 ; 34b
+ mova [esp+0x160], m2
+ mova [esp+0x170], m3
+ pshufb m0, m6 ; 7a 6a
+ pshufb m1, m6 ; 7b 6b
+ punpckhwd m7, m0 ; 56a
+ punpckhwd m5, m1 ; 56b
+ punpcklwd m0, m4
+ punpckhqdq m4, m4
+ punpcklwd m1, m4
+ mova [esp+0x180], m7
+ mova [esp+0x190], m5
+ mova [esp+0x1a0], m0
+ mova [esp+0x1b0], m1
+ mova m0, [esp+0x140]
+ mova m1, [esp+0x150]
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 0x20, 0x1c0, 0
+ mov myd, mym
+ mova m0, [esp+0x160]
+ mova m1, [esp+0x170]
+ mova m2, [esp+0x180]
+ mova m3, [esp+0x190]
+ mova [esp+0x140], m0
+ mova [esp+0x150], m1
+ mova m4, [esp+0x1a0]
+ mova m5, [esp+0x1b0]
+ mova [esp+0x160], m2
+ mova [esp+0x170], m3
+ mova m6, [esp+0x1c0]
+ mova m7, [esp+0x1d0]
+ mova [esp+0x180], m4
+ mova [esp+0x190], m5
+ punpcklwd m4, m6, m7
+ punpckhwd m6, m7
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m6
+%endif
+ jmp .vloop
+INIT_XMM ssse3
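+; dy == 1024: the vertical step is exactly one source row per output row,
+; so the vertical filter coefficients are constant for the whole block.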
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy1_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ movzx r5, byte [esp+0x1f0]
+ dec srcd
+ movd m15, r5
+ %endif
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+ %else
+ %define m11 [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [esp+0x00], m14
+ %define m14 [esp+0x00]
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m8 m5
+ %define m15 m6
+ %endif
+ movq m0, [srcq+ssq*0]
+ movq m2, [srcq+ssq*2]
+ movhps m0, [srcq+ssq*1]
+ movhps m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %if ARCH_X86_64
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movq m10, r4
+ %else
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ %define m10 m4
+ movd m10, r4
+ movd m3, r3
+ mov r3, r3m
+ punpckldq m10, m3
+ %endif
+ movq m1, [srcq+ssq*0]
+ movq m3, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*1]
+ add srcq, ss3q
+ punpcklbw m10, m10
+ psraw m10, 8
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ %if ARCH_X86_64
+ pand m11, m8
+ %else
+ pand m7, m11, m8
+ %define m11 m7
+ %endif
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ %if ARCH_X86_64
+ pshufd m8, m10, q0000
+ pshufd m9, m10, q1111
+ pshufd m11, m10, q3333
+ pshufd m10, m10, q2222
+ %else
+ mova [esp+0x10], m15
+ %define m15 [esp+0x10]
+ mov r0, r0m
+ pshufd m5, m4, q0000
+ pshufd m6, m4, q1111
+ pshufd m7, m4, q2222
+ pshufd m4, m4, q3333
+ %define m8 [esp+0x20]
+ %define m9 [esp+0x30]
+ %define m10 [esp+0x40]
+ %define m11 [esp+0x50]
+ mova m8, m5
+ mova m9, m6
+ mova m10, m7
+ mova m11, m4
+ %endif
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ palignr m2, m1, m0, 4
+ pshufd m4, m1, q2121
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ punpcklwd m2, m1, m4 ; 45 56
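+; Two new source rows per iteration, two output rows written.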
+.dy1_w2_loop:
+ movq m1, [srcq+ssq*0]
+ movhps m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m3, m8
+ pmaddwd m6, m0, m9
+ pmaddwd m7, m2, m10
+ mova m3, m0
+ mova m0, m2
+ paddd m5, m13
+ paddd m6, m7
+ pshufb m1, m14
+ pmaddubsw m1, m15
+ phaddw m1, m1
+ pmulhrsw m1, m12
+ palignr m7, m1, m4, 12
+ punpcklwd m2, m7, m1 ; 67 78
+ pmaddwd m7, m2, m11
+ mova m4, m1
+ paddd m5, m6
+ paddd m5, m7
+ psrad m5, rndshift
+ packssdw m5, m5
+ packuswb m5, m5
+ movd r4d, m5
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy1_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ psrldq m7, m15, 8
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+%else
+ movd r1, m15
+ movd r3, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r4, m15
+ movd r5, m7
+ %define m15 m5
+ SWAP m4, m7
+ movd m15, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m4, [base+subpel_filters+r5*8+2]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+%endif
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m15, m2
+ movq m6, [base+subpel_s_shuf2]
+%if ARCH_X86_64
+ pcmpeqd m8, m9
+ psrld m14, 10
+ pshufb m14, [base+bdct_lb_dw]
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpcklqdq m6, m6
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m7, [srcq+ssq*2]
+ add srcq, ss3q
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ paddb m14, m6
+ movq m10, r4q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ pmaddubsw m7, m15
+ phaddw m0, m1
+ phaddw m2, m3
+ phaddw m4, m5
+ phaddw m6, m7, m7
+ pmulhrsw m0, m12 ; 0 1
+ pmulhrsw m2, m12 ; 2 3
+ pmulhrsw m4, m12 ; 4 5
+ pmulhrsw m6, m12 ; 6 _
+ shufps m1, m0, m2, q1032 ; 1 2
+ shufps m3, m2, m4, q1032 ; 3 4
+ shufps m5, m4, m6, q1032 ; 5 6
+ punpcklwd m7, m0, m1 ; 01
+ punpckhwd m0, m1 ; 12
+ punpcklwd m8, m2, m3 ; 23
+ punpckhwd m2, m3 ; 34
+ punpcklwd m9, m4, m5 ; 45
+ punpckhwd m4, m5 ; 56
+%else
+ pxor m3, m3
+ pcmpeqd m8, m3
+ psrld m14, 10
+ pshufb m14, [base+bdct_lb_dw]
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ add srcq, ss3q
+ punpcklqdq m6, m6
+ SWAP m4, m7
+ pand m7, m11, m8
+ pandn m8, m15
+ SWAP m5, m0
+ por m15, m7
+ paddb m14, m6
+ movu m0, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m0, m14
+ pshufb m7, m14
+ pshufb m6, m14
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ mova [esp+0x00], m14
+ mova [esp+0x10], m15
+ pmaddubsw m0, m15
+ pmaddubsw m7, m15
+ pmaddubsw m6, m15
+ phaddw m1, m2
+ movu m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ mov r0, r0m
+ phaddw m3, m0
+ pshufb m2, m14
+ pmaddubsw m2, m15
+ %define m14 [esp+0x00]
+ %define m15 [esp+0x10]
+ phaddw m7, m6
+ phaddw m2, m2
+ movd m6, r4
+ movd m0, r5
+ punpckldq m6, m0
+ punpcklbw m6, m6
+ psraw m6, 8
+ mova [esp+0x20], m6
+ pmulhrsw m1, m12 ; 0 1
+ pmulhrsw m3, m12 ; 2 3
+ pmulhrsw m7, m12 ; 4 5
+ pmulhrsw m2, m12 ; 6 _
+ shufps m0, m1, m3, q1032 ; 1 2
+ shufps m4, m3, m7, q1032 ; 3 4
+ shufps m5, m7, m2, q1032 ; 5 6
+ punpcklwd m6, m1, m0 ; 01
+ punpckhwd m1, m0 ; 12
+ mova [esp+0x30], m1
+ punpcklwd m1, m3, m4 ; 23
+ punpckhwd m3, m4 ; 34
+ mova [esp+0x40], m3
+ punpcklwd m3, m7, m5 ; 45
+ punpckhwd m7, m5 ; 56
+ mova [esp+0x50], m7
+ mova [esp+0x60], m2
+ mova m0, [esp+0x20]
+ %xdefine m8 m1
+ %xdefine m9 m3
+ %xdefine m10 m0
+ SWAP m7, m6
+ SWAP m1, m4
+ SWAP m3, m2
+%endif
+ pshufd m1, m10, q0000
+ pshufd m3, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_64
+ mova [rsp+0x00], m8
+ mova [rsp+0x10], m2
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m4
+%else
+ mova [esp+0x70], m8
+ mova [esp+0x80], m9
+ mova [esp+0x90], m1
+ mova [esp+0xa0], m3
+ mova [esp+0xb0], m5
+ mova [esp+0xc0], m10
+ %ifidn %1, put
+ mov dsd, dsm
+ %endif
+ %define m11 m6
+%endif
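+; Two new source rows per iteration, two 4-pixel output rows written.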
+.dy1_w4_loop:
+%if ARCH_X86_64
+ movu m11, [srcq+ssq*0]
+ pmaddwd m7, m1
+ pmaddwd m8, m3
+ pmaddwd m0, m1
+ pmaddwd m2, m3
+ pmaddwd m9, m5
+ pmaddwd m4, m5
+ paddd m7, m8
+ paddd m0, m2
+ movu m8, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m11, m14
+ pmaddubsw m11, m15
+ paddd m7, m13
+ paddd m0, m13
+ paddd m7, m9
+ paddd m0, m4
+ pshufb m8, m14
+ pmaddubsw m8, m15
+ phaddw m11, m8
+ mova m8, [rsp+0x20]
+ pmulhrsw m11, m12
+ punpcklwd m9, m6, m11 ; 67
+ psrldq m6, m11, 8
+ punpcklwd m4, m11, m6 ; 78
+ pmaddwd m2, m9, m10
+ pmaddwd m11, m4, m10
+ paddd m7, m2
+ mova m2, [rsp+0x30]
+ paddd m0, m11
+%else
+ SWAP m7, m6
+ SWAP m1, m4
+ SWAP m3, m2
+ movu m5, [srcq+ssq*0]
+ mova m0, [esp+0x30]
+ mova m2, [esp+0x40]
+ mova m4, [esp+0x50]
+ pmaddwd m6, [esp+0x90]
+ pmaddwd m1, [esp+0xa0]
+ pmaddwd m0, [esp+0x90]
+ pmaddwd m2, [esp+0xa0]
+ pmaddwd m3, [esp+0xb0]
+ pmaddwd m4, [esp+0xb0]
+ paddd m6, m1
+ paddd m0, m2
+ movu m7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ paddd m6, m13
+ paddd m0, m13
+ paddd m6, m3
+ paddd m0, m4
+ pshufb m7, m14
+ pmaddubsw m7, m15
+ phaddw m5, m7
+ mova m7, [rsp+0x80]
+ pmulhrsw m5, m12
+ punpcklwd m3, [esp+0x60], m5 ; 67
+ psrldq m1, m5, 8
+ punpcklwd m4, m5, m1 ; 78
+ pmaddwd m2, m3, [esp+0xc0]
+ pmaddwd m5, m4, [esp+0xc0]
+ mova [esp+0x60], m1
+ paddd m6, m2
+ mova m2, [esp+0x50]
+ paddd m0, m5
+ SWAP m7, m6
+%endif
+ psrad m7, rndshift
+ psrad m0, rndshift
+ packssdw m7, m0
+%if ARCH_X86_64
+ mova m0, [rsp+0x10]
+%else
+ mova m0, [esp+0x40]
+%define m11 m5
+%endif
+%ifidn %1, put
+ packuswb m7, m7
+ psrldq m11, m7, 4
+ movd [dstq+dsq*0], m7
+ movd [dstq+dsq*1], m11
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m7
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jz .ret
+%if ARCH_X86_64
+ mova m7, [rsp+0x00]
+ mova [rsp+0x00], m8
+ mova [rsp+0x10], m2
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m4
+%else
+ mova m7, [esp+0x70] ; 01
+ mova m1, [esp+0x80] ; 23
+ mova m2, [esp+0x50] ; 34
+ mova [esp+0x30], m0
+ mova [esp+0x70], m1
+ mova [esp+0x40], m2
+ mova [esp+0x80], m3
+ mova [esp+0x50], m4
+%endif
+ jmp .dy1_w4_loop
+INIT_XMM ssse3
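+; dy1 variant of the 8..128 wide column loop (same scheme as .w8 above).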
+.dy1_w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+%if ARCH_X86_64
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define m8 m0
+ %define m9 m1
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ sub srcq, 3
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+ punpcklbw m3, m3
+ psraw m3, 8
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ punpcklbw m5, m5
+ psraw m5, 8
+ SWAP m3, m5
+%endif
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [rsp+0x140], m0
+ mova [rsp+0x150], m1
+ mova [rsp+0x160], m2
+ mova [rsp+0x170], m3
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ SWAP m5, m3
+ mov r5, hm
+ mov [esp+0x134], r5
+%endif
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+%if ARCH_X86_64
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+%else
+ add dword [rsp+0x130], 8*(isprep+1)
+ mov r5, [esp+0x134]
+ mov r0, [esp+0x130]
+%endif
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+%else
+ %define m10 [base+pd_0x3ff]
+%endif
+ mova m15, [rsp+0x120]
+ mov srcq, [rsp+0x098]
+%if ARCH_X86_64
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy1_hloop:
+ pxor m9, m9
+%if ARCH_X86_64
+ mova m11, [base+pq_0x40000000]
+%else
+ %define m11 [base+pq_0x40000000]
+%endif
+ psrld m2, m14, 10
+ mova [rsp], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m2, m5, 8
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+ pxor m2, m2
+ %define m9 m2
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m8, m11, m4
+ pand m9, m11, m6
+ pand m15, m11, m7
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m7, m2
+ pandn m5, m3
+ por m8, m4
+ por m9, m6
+ por m15, m7
+ por m11, m5
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m15, [rsp+0x80]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ SWAP m14, m8
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m15; 23a
+ punpckhwd m3, m15 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+ mova m14, [base+unpckw]
+%else
+ movd r0, m15
+ movd rX, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r4, m15
+ movd r5, m4
+ mova m14, [esp+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [esp+16], m14
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m11, m4
+ pand m1, m11, m6
+ pand m2, m11, m7
+ pand m3, m11, m5
+ pandn m4, [esp+0x20]
+ pandn m6, [esp+0x30]
+ pandn m7, [esp+0x40]
+ pandn m5, [esp+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1
+ MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3
+ MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5
+ MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7
+ mova m5, [esp+0x1a0]
+ mova m6, [esp+0x1b0]
+ mova m7, [esp+0x1c0]
+ mova m0, [esp+0x1d0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m6
+ mova [esp+0x1d0], m7
+ mova m1, [esp+0x060]
+ mova m2, [esp+0x070]
+ mova m3, [esp+0x180]
+ mova m4, [esp+0x190]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [esp+0x060], m0
+ mova [esp+0x070], m1
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ %define m8 [esp+0x140]
+ %define m9 [esp+0x150]
+ %define m10 [esp+0x160]
+ %define m11 [esp+0x170]
+%endif
+.dy1_vloop:
+%if ARCH_X86_32
+ mov r0, r0m
+%endif
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m9
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+%else
+ pmaddwd m6, [rsp+0x1a0], m10
+ pmaddwd m7, [rsp+0x1b0], m10
+%endif
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x70], m11
+ pmaddwd m7, [rsp+0x80], m11
+%else
+ pmaddwd m6, [rsp+0x1c0], m11
+ pmaddwd m7, [rsp+0x1d0], m11
+%endif
+ paddd m4, m6
+ paddd m5, m7
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+%if ARCH_X86_32
+ mov r0m, r0
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+%if ARCH_X86_64
+ movq m4, [srcq+ r4]
+ movq m5, [srcq+ r6]
+ movhps m4, [srcq+ r7]
+ movhps m5, [srcq+ r9]
+ movq m6, [srcq+r10]
+ movq m7, [srcq+r11]
+ movhps m6, [srcq+r13]
+ movhps m7, [srcq+ rX]
+ add srcq, ssq
+ pshufd m15, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m15 ; 3a 2a
+ pshufb m3, m15 ; 3b 2b
+ pmaddubsw m4, [rsp+0x10]
+ pmaddubsw m5, [rsp+0x20]
+ pmaddubsw m6, [rsp+0x30]
+ pmaddubsw m7, [rsp+0x40]
+ phaddw m4, m5
+ phaddw m6, m7
+ phaddw m4, m6
+ pmulhrsw m4, m12
+ pshufb m5, [rsp+0x70], m15 ; 7a 6a
+ pshufb m7, [rsp+0x80], m15 ; 7b 6b
+ pshufb m6, [rsp+0x50], m14 ; 4a 5a
+ pshufb m15, [rsp+0x60], m14 ; 4b 5b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m6 ; 34a
+ punpcklwd m3, m15 ; 34b
+ punpckhwd m6, m5 ; 56a
+ punpckhwd m15, m7 ; 56b
+ punpcklwd m5, m4 ; 78a
+ psrldq m4, 8
+ punpcklwd m7, m4 ; 78b
+ mova [rsp+0x50], m6
+ mova [rsp+0x60], m15
+ mova [rsp+0x70], m5
+ mova [rsp+0x80], m7
+%else
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova m6, [base+unpckw]
+ mova m0, [esp+0x060]
+ mova m1, [esp+0x070]
+ mova m7, [esp+0x1a0]
+ movq m4, [srcq+r0]
+ movq m5, [srcq+rX]
+ movhps m4, [srcq+r4]
+ movhps m5, [srcq+r5]
+ pshufb m0, m6 ; 0a 1a
+ pshufb m1, m6 ; 0b 1b
+ pshufb m7, m6 ; 4a 5a
+ mov r0, [esp+16]
+ mov rX, [esp+24]
+ mov r4, [esp+20]
+ mov r5, [esp+28]
+ movq m3, [srcq+r0]
+ movq m2, [srcq+rX]
+ movhps m3, [srcq+r4]
+ movhps m2, [srcq+r5]
+ add srcq, ssq
+ pmaddubsw m4, [esp+0x20]
+ pmaddubsw m5, [esp+0x30]
+ pmaddubsw m3, [esp+0x40]
+ pmaddubsw m2, [esp+0x50]
+ phaddw m4, m5
+ phaddw m3, m2
+ mova m5, [esp+0x1b0]
+ mova m2, [esp+0x180]
+ phaddw m4, m3
+ mova m3, [esp+0x190]
+ pmulhrsw m4, m12 ; 8a 8b
+ pshufb m5, m6 ; 4b 5b
+ pshufd m6, m6, q1032
+ pshufb m2, m6 ; 3a 2a
+ pshufb m3, m6 ; 3b 2b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ mova [esp+0x60], m0
+ mova [esp+0x70], m1
+ mova m0, [esp+0x1c0]
+ mova m1, [esp+0x1d0]
+ punpcklwd m2, m7 ; 34a
+ punpcklwd m3, m5 ; 34b
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ pshufb m0, m6 ; 7a 6a
+ pshufb m1, m6 ; 7b 6b
+ punpckhwd m7, m0 ; 56a
+ punpckhwd m5, m1 ; 56b
+ punpcklwd m0, m4
+ punpckhqdq m4, m4
+ punpcklwd m1, m4
+ mova [esp+0x1a0], m7
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m0
+ mova [esp+0x1d0], m1
+ mova m0, [esp+0x60]
+ mova m1, [esp+0x70]
+%endif
+ jmp .dy1_vloop
+INIT_XMM ssse3
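+; dy == 2048: each output row advances the source position by two rows;
+; the vertical filter coefficients are constant for the whole block.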
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy2_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ %else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ movzx r5, byte [esp+0x1f0]
+ dec srcd
+ movd m15, r5
+ %endif
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [esp+0x00], m14
+ %define m14 [esp+0x00]
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m8 m5
+ %define m15 m6
+ %endif
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ movhps m0, [srcq+ssq*2]
+ movhps m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %if ARCH_X86_64
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movq m10, r4q
+ %else
+ mov myd, mym
+ mov r3, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r3, r3m
+ %define m10 m4
+ movd m10, r4
+ movd m3, r5
+ punpckldq m10, m3
+ %endif
+ movq m3, [srcq+ssq*0]
+ movhps m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m10, m10
+ psraw m10, 8
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ %if ARCH_X86_64
+ pand m11, m8
+ %else
+ pand m7, m11, m8
+ %define m11 m7
+ %endif
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ %if ARCH_X86_64
+ pshufd m8, m10, q0000
+ pshufd m9, m10, q1111
+ pshufd m11, m10, q3333
+ pshufd m10, m10, q2222
+ %else
+ mova [esp+0x10], m15
+ %define m15 [esp+0x10]
+ mov r5, r0m
+ %define dstq r5
+ mov dsd, dsm
+ pshufd m5, m4, q0000
+ pshufd m6, m4, q1111
+ pshufd m7, m4, q2222
+ pshufd m4, m4, q3333
+ %define m8 [esp+0x20]
+ %define m9 [esp+0x30]
+ %define m10 [esp+0x40]
+ %define m11 [esp+0x50]
+ mova m8, m5
+ mova m9, m6
+ mova m10, m7
+ mova m11, m4
+ %endif
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ pslldq m2, m3, 8
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 2 _ 4
+ pmulhrsw m1, m12 ; 1 3 _ 5
+ pshufd m2, m0, q3110 ; 0 2 2 4
+ pshufd m1, m1, q3110 ; 1 3 3 5
+ punpcklwd m3, m2, m1 ; 01 23
+ punpckhwd m2, m1 ; 23 45
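+; Four new source rows per iteration, two output rows written (dy == 2).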
+.dy2_w2_loop:
+ movq m6, [srcq+ssq*0]
+ movq m7, [srcq+ssq*1]
+ movhps m6, [srcq+ssq*2]
+ movhps m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m3, m8
+ pmaddwd m5, m2, m9
+ pshufb m6, m14
+ pshufb m7, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ phaddw m6, m7
+ pmulhrsw m6, m12
+ psrldq m7, m6, 8
+ palignr m6, m0, 8
+ palignr m7, m1, 8
+ mova m0, m6
+ mova m1, m7
+ pshufd m6, m6, q3221
+ pshufd m7, m7, q3221
+ punpcklwd m3, m6, m7 ; 45 67
+ punpckhwd m2, m6, m7 ; 67 89
+ pmaddwd m6, m3, m10
+ pmaddwd m7, m2, m11
+ paddd m4, m5
+ paddd m4, m13
+ paddd m6, m7
+ paddd m4, m6
+ psrad m4, rndshift
+ packssdw m4, m4
+ packuswb m4, m4
+ movd r4d, m4
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy2_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %define dstq r0
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ psrldq m7, m15, 8
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+ movq m6, [base+subpel_s_shuf2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+%else
+ movd r1, m15
+ movd r3, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r4, m15
+ movd r5, m7
+ %define m15 m5
+ SWAP m4, m7
+ movd m15, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m4, [base+subpel_filters+r5*8+2]
+ movq m6, [base+subpel_s_shuf2]
+ mov myd, mym
+ mov r3, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+%endif
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m15, m2
+%if ARCH_X86_64
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu m0, [srcq+ssq*0]
+ movu m2, [srcq+ssq*2]
+ movu m1, [srcq+ssq*1]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpcklqdq m6, m6
+ pshufb m14, [base+bdct_lb_dw]
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ paddb m14, m6
+ movq m11, r4q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb m5, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ phaddw m4, m5
+ pmulhrsw m0, m12 ; 0 2
+ pmulhrsw m1, m12 ; 1 3
+ pmulhrsw m4, m12 ; 4 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+%else
+ pxor m3, m3
+ pcmpeqd m8, m3
+ psrld m14, 10
+ pshufb m14, [base+bdct_lb_dw]
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ssq*1]
+ add srcq, ss3q
+ punpcklqdq m6, m6
+ SWAP m4, m7
+ pand m7, m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m7
+ paddb m14, m6
+ movu m0, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m0, m14
+ pshufb m7, m14
+ pshufb m6, m14
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ mova [esp+0x00], m14
+ mova [esp+0x10], m15
+ pmaddubsw m0, m15
+ pmaddubsw m7, m15
+ pmaddubsw m6, m15
+ %define m14 [esp+0x00]
+ %define m15 [esp+0x10]
+ phaddw m1, m2
+ phaddw m3, m0
+ phaddw m7, m6
+ %ifidn %1, put
+ mov dsd, dsm
+ %define dstq r5
+ %else
+ %define tmpq r5
+ %endif
+ movd m6, r4
+ movd m0, r5
+ punpckldq m6, m0
+ punpcklbw m6, m6
+ psraw m6, 8
+ mov r5, r0m
+ pmulhrsw m1, m12 ; 0 2
+ pmulhrsw m3, m12 ; 1 3
+ pmulhrsw m7, m12 ; 4 5
+ SWAP m0, m1, m3
+ SWAP m4, m7
+ pshufd m2, m6, q0000
+ pshufd m3, m6, q1111
+ pshufd m7, m6, q2222
+ pshufd m6, m6, q3333
+ mova [esp+0x30], m2
+ mova [esp+0x40], m3
+ mova [esp+0x50], m7
+ mova [esp+0x60], m6
+ %define m8 [esp+0x30]
+ %define m9 [esp+0x40]
+ %define m10 [esp+0x50]
+ %define m11 [esp+0x60]
+%endif
+ psrldq m5, m4, 8 ; 5 _
+ punpckhwd m2, m0, m1 ; 23
+ punpcklwd m0, m1 ; 01
+ punpcklwd m4, m5 ; 45
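+; Four new source rows per iteration, two 4-pixel output rows written.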
+.dy2_w4_loop:
+ pmaddwd m0, m8 ; a0
+ pmaddwd m5, m2, m8 ; b0
+ pmaddwd m2, m9 ; a1
+ pmaddwd m7, m4, m9 ; b1
+ pmaddwd m3, m4, m10 ; a2
+ paddd m0, m13
+ paddd m5, m13
+ paddd m0, m2
+ paddd m5, m7
+ paddd m0, m3
+ movu m6, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m6, m14
+ pshufb m7, m14
+ pshufb m3, m14
+ pshufb m1, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ pmaddubsw m3, m15
+ pmaddubsw m1, m15
+ phaddw m6, m7
+ phaddw m3, m1
+ pmulhrsw m6, m12 ; 6 7
+ pmulhrsw m3, m12 ; 8 9
+ psrldq m7, m6, 8
+ psrldq m1, m3, 8
+ punpcklwd m6, m7 ; 67
+ punpcklwd m3, m1 ; 89
+ mova m2, m6
+ pmaddwd m1, m6, m10 ; b2
+ pmaddwd m6, m11 ; a3
+ pmaddwd m7, m3, m11 ; b3
+ paddd m5, m1
+ paddd m0, m6
+ paddd m5, m7
+ psrad m0, rndshift
+ psrad m5, rndshift
+ packssdw m0, m5
+%ifidn %1, put
+ packuswb m0, m0
+ psrldq m1, m0, 4
+ movd [dstq+dsq*0], m0
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m0
+ add tmpq, 16
+%endif
+ mova m0, m4
+ mova m4, m3
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+INIT_XMM ssse3
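+; dy2 variant of the 8..128 wide column loop (same scheme as .w8 above).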
+.dy2_w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+%if ARCH_X86_64
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %define m9 m1
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define tmpq r0
+ %define ssq ssm
+ %else
+ %define dstq r0
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ sub srcq, 3
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+ punpcklbw m3, m3
+ psraw m3, 8
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ punpcklbw m5, m5
+ psraw m5, 8
+ SWAP m3, m5
+%endif
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [rsp+0x140], m0
+ mova [rsp+0x150], m1
+ mova [rsp+0x160], m2
+ mova [rsp+0x170], m3
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ SWAP m5, m3
+ mov r5, hm
+ mov [esp+0x134], r5
+%endif
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+%if ARCH_X86_64
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+%else
+ add dword [rsp+0x130], 8*(isprep+1)
+ mov r5, [esp+0x134]
+ mov r0, [esp+0x130]
+%endif
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+%else
+ %define m10 [base+pd_0x3ff]
+%endif
+ mova m15, [rsp+0x120]
+ mov srcq, [rsp+0x098]
+%if ARCH_X86_64
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy2_hloop:
+ pxor m9, m9
+%if ARCH_X86_64
+ mova m11, [base+pq_0x40000000]
+%else
+ %define m11 [base+pq_0x40000000]
+%endif
+ psrld m2, m14, 10
+ mova [rsp], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m2, m5, 8
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+ pxor m2, m2
+ %define m9 m2
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m8, m11, m4
+ pand m9, m11, m6
+ pand m15, m11, m7
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m7, m2
+ pandn m5, m3
+ por m8, m4
+ por m9, m6
+ por m15, m7
+ por m11, m5
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m15, [rsp+0x80]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ SWAP m14, m8
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m15; 23a
+ punpckhwd m3, m15 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+%else
+ movd r0, m15
+ movd rX, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r4, m15
+ movd r5, m4
+ mova m14, [esp+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [esp+16], m14
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m11, m4
+ pand m1, m11, m6
+ pand m2, m11, m7
+ pand m3, m11, m5
+ pandn m4, [esp+0x20]
+ pandn m6, [esp+0x30]
+ pandn m7, [esp+0x40]
+ pandn m5, [esp+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1
+ MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3
+ MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5
+ MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7
+ mova m5, [esp+0x1a0]
+ mova m6, [esp+0x1b0]
+ mova m7, [esp+0x1c0]
+ mova m0, [esp+0x1d0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m6
+ mova [esp+0x1d0], m7
+ mova m1, [esp+0x060]
+ mova m2, [esp+0x070]
+ mova m3, [esp+0x180]
+ mova m4, [esp+0x190]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ %define m8 [esp+0x140]
+ %define m9 [esp+0x150]
+ %define m10 [esp+0x160]
+ %define m11 [esp+0x170]
+%endif
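+ ; vertical pass: row pairs 01/23 (m0-m3) and the spilled 45/67 pairs are each
+ ; pmaddwd'd with their coefficient pair in m8-m11, then summed, rounded and
+ ; packed into one 8-pixel output row per iteration.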
+.dy2_vloop:
+%if ARCH_X86_32
+ mov r0, r0m
+%endif
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m9
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+%else
+ pmaddwd m6, [esp+0x1a0], m10
+ pmaddwd m7, [esp+0x1b0], m10
+%endif
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x70], m11
+ pmaddwd m7, [rsp+0x80], m11
+%else
+ pmaddwd m6, [esp+0x1c0], m11
+ pmaddwd m7, [esp+0x1d0], m11
+%endif
+ paddd m4, m6
+ paddd m5, m7
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+%if ARCH_X86_32
+ mov r0m, r0
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+%if ARCH_X86_64
+ mova m8, [rsp+0x10]
+ mova m9, [rsp+0x20]
+ mova m10, [rsp+0x30]
+ mova m11, [rsp+0x40]
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
+ mova m3, [rsp+0x50] ; 23a
+ mova m4, [rsp+0x60] ; 23b
+ mova m5, [rsp+0x70] ; 45a
+ mova m7, [rsp+0x80] ; 45b
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m14, m2, m6 ; 67a
+ punpckhwd m2, m6 ; 67b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m7
+ mova [rsp+0x70], m14
+ mova [rsp+0x80], m2
+ mova m2, m3
+ mova m3, m4
+%else
+ MC_8TAP_SCALED_H 0x20, 0
+ punpcklwd m6, m0, m4
+ punpckhwd m7, m0, m4
+ mova m0, [esp+0x180] ; 01a
+ mova m1, [esp+0x190] ; 01b
+ mova m2, [rsp+0x1a0] ; 23a
+ mova m3, [esp+0x1b0] ; 23b
+ mova m4, [esp+0x1c0] ; 45a
+ mova m5, [esp+0x1d0] ; 45b
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m6 ; 67a
+ mova [esp+0x1d0], m7 ; 67b
+%endif
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
+ %define r0m [rstk+stack_offset+ 4]
+ %define r1m [rstk+stack_offset+ 8]
+ %define r2m [rstk+stack_offset+12]
+ %define r3m [rstk+stack_offset+16]
+%endif
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_8bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 8
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp, SHARP, SHARP
+FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 7
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp, SHARP, SHARP
+FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%if ARCH_X86_32
+ %macro SAVE_ALPHA_BETA 0
+ mov alpham, alphad
+ mov betam, betad
+ %endmacro
+
+ %macro SAVE_DELTA_GAMMA 0
+ mov deltam, deltad
+ mov gammam, gammad
+ %endmacro
+
+ %macro LOAD_ALPHA_BETA_MX 0
+ mov mym, myd
+ mov alphad, alpham
+ mov betad, betam
+ mov mxd, mxm
+ %endmacro
+
+ %macro LOAD_DELTA_GAMMA_MY 0
+ mov mxm, mxd
+ mov deltad, deltam
+ mov gammad, gammam
+ mov myd, mym
+ %endmacro
+
+ %define PIC_reg r2
+ %define PIC_base_offset $$
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+%else
+ %define SAVE_ALPHA_BETA
+ %define SAVE_DELTA_GAMMA
+ %define PIC_sym(sym) sym
+%endif
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+ %assign copy_args 8*4
+ %else
+ %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 0
+ %if copy_args
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r3, r3m
+ mov r5, r5m
+ mov dstm, r0
+ mov dsm, r1
+ mov srcm, r2
+ mov ssm, r3
+ mov mxm, r5
+ mov r0, r6m
+ mov mym, r0
+ %endif
+%endmacro
+
+%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
+ %if cpuflag(sse4)
+ pblendw %1, %2, 0xAA
+ %else
+ pand %2, m10
+ por %1, %2
+ %endif
+%endmacro
+
+%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
+ %if ARCH_X86_32
+ %define m8 m4
+ %define m9 m5
+ %define m14 m6
+ %define m15 m7
+ %define m11 m7
+ %endif
+ %if notcpuflag(ssse3) || ARCH_X86_32
+ pxor m11, m11
+ %endif
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m2, [filterq+myq *8] ; a
+ movq m8, [filterq+tmp1q*8] ; e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; b
+ movq m0, [filterq+tmp1q*8] ; f
+ punpcklwd m2, m3
+ punpcklwd m8, m0
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m0, [filterq+myq *8] ; c
+ movq m9, [filterq+tmp1q*8] ; g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; d
+ movq m1, [filterq+tmp1q*8] ; h
+ punpcklwd m0, m3
+ punpcklwd m9, m1
+ punpckldq m1, m2, m0
+ punpckhdq m2, m0
+ punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m0, %3
+ pmaddwd m3, %5
+ pmaddwd m1, %7
+ pmaddwd m14, %9
+ paddd m0, m3
+ paddd m1, m14
+ paddd m0, m1
+ mova %1, m0
+ %if ARCH_X86_64
+ SWAP m3, m14
+ %endif
+ punpckldq m0, m8, m9
+ punpckhdq m8, m9
+ punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
+ punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
+ punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
+ punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
+ pmaddwd m1, %4
+ pmaddwd m14, %6
+ pmaddwd m2, %8
+ pmaddwd m15, %10
+ paddd m1, m14
+ paddd m2, m15
+ paddd m1, m2
+ mova %2, m1
+ %if ARCH_X86_64
+ SWAP m14, m3
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %define counterd r4d
+%else
+ %if copy_args == 0
+ %define counterd dword r4m
+ %else
+ %define counterd dword [esp+stack_size-4*7]
+ %endif
+%endif
+
+%macro WARP_AFFINE_8X8T 0
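+; same filtering as warp_affine_8x8, but reuses its .main/.main2 helpers and
+; stores two rows of 16-bit intermediates per iteration instead of packing to
+; pixels.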
+%if ARCH_X86_64
+cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
+%else
+cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
+ %if copy_args
+ %define tmpm [esp+stack_size-4*1]
+ %define tsm [esp+stack_size-4*2]
+ %endif
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
+.loop:
+%if ARCH_X86_32
+ %define m12 m4
+ %define m13 m5
+ %define m14 m6
+ %define m15 m7
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(ssse3)
+ psrad m12, 13
+ psrad m13, 13
+ psrad m14, 13
+ psrad m15, 13
+ packssdw m12, m13
+ packssdw m14, m15
+ mova m13, [PIC_sym(pw_8192)]
+ pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
+ pmulhrsw m14, m13
+%else
+ %if ARCH_X86_32
+ %define m10 m0
+ %endif
+ mova m10, [PIC_sym(pd_16384)]
+ paddd m12, m10
+ paddd m13, m10
+ paddd m14, m10
+ paddd m15, m10
+ psrad m12, 15
+ psrad m13, 15
+ psrad m14, 15
+ psrad m15, 15
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ mova [tmpq+tsq*0], m12
+ mova [tmpq+tsq*2], m14
+ dec counterd
+ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
+%if ARCH_X86_32
+ mov tmpm, tmpd
+ mov r0, [esp+0x100]
+ mov r1, [esp+0x104]
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+%endmacro
+
+%macro WARP_AFFINE_8X8 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+%else
+cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+ %define alphaq r0
+ %define alphad r0
+ %define alpham [esp+gprsize+0x100]
+ %define betaq r1
+ %define betad r1
+ %define betam [esp+gprsize+0x104]
+ %define deltaq r0
+ %define deltad r0
+ %define deltam [esp+gprsize+0x108]
+ %define gammaq r1
+ %define gammad r1
+ %define gammam [esp+gprsize+0x10C]
+ %define filterq r3
+ %define tmp1q r4
+ %define tmp1d r4
+ %define tmp1m [esp+gprsize+0x110]
+ %define myq r5
+ %define myd r5
+ %define mym r6m
+ %if copy_args
+ %define dstm [esp+stack_size-4*1]
+ %define dsm [esp+stack_size-4*2]
+ %define srcm [esp+stack_size-4*3]
+ %define ssm [esp+stack_size-4*4]
+ %define mxm [esp+stack_size-4*5]
+ %define mym [esp+stack_size-4*6]
+ %endif
+%endif
+ call .main
+ jmp .start
+.loop:
+%if ARCH_X86_32
+ mov dstm, dstd
+ mov alphad, [esp+0x100]
+ mov betad, [esp+0x104]
+%endif
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+%if notcpuflag(sse4)
+ %if cpuflag(ssse3)
+ %define roundval pw_8192
+ %else
+ %define roundval pd_262144
+ %endif
+ %if ARCH_X86_64
+ mova m10, [PIC_sym(roundval)]
+ %else
+ %define m10 [PIC_sym(roundval)]
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m12 m5
+ %define m13 m6
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+%endif
+%if cpuflag(sse4)
+ %if ARCH_X86_32
+ %define m11 m4
+ pxor m11, m11
+ %endif
+ psrad m12, 18
+ psrad m13, 18
+ packusdw m12, m13
+ pavgw m12, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+ psrad m12, 17
+ psrad m13, 17
+ packssdw m12, m13
+ pmulhrsw m12, m10
+ %else
+ paddd m12, m10
+ paddd m13, m10
+ psrad m12, 19
+ psrad m13, 19
+ packssdw m12, m13
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m14 m6
+ %define m15 m7
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(sse4)
+ psrad m14, 18
+ psrad m15, 18
+ packusdw m14, m15
+ pavgw m14, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+ psrad m14, 17
+ psrad m15, 17
+ packssdw m14, m15
+ pmulhrsw m14, m10
+ %else
+ paddd m14, m10
+ paddd m15, m10
+ psrad m14, 19
+ psrad m15, 19
+ packssdw m14, m15
+ %endif
+%endif
+ packuswb m12, m14
+ movq [dstq+dsq*0], m12
+ movhps [dstq+dsq*1], m12
+ dec counterd
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+%assign stack_offset stack_offset+gprsize
+%if ARCH_X86_32
+ %assign stack_size stack_size+4
+ %if copy_args
+ %assign stack_offset stack_offset-4
+ %endif
+ RELOC_ARGS
+ LEA PIC_reg, $$
+ %define PIC_mem [esp+gprsize+0x114]
+ mov abcdd, abcdm
+ %if copy_args == 0
+ mov ssd, ssm
+ mov mxd, mxm
+ %endif
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+ SAVE_DELTA_GAMMA
+%if ARCH_X86_32
+ mov abcdd, abcdm
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+%if ARCH_X86_32
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%endif
+ sub betad, tmp2d ; beta -= alpha*3
+ lea filterq, [PIC_sym(mc_warp_filter2)]
+%if ARCH_X86_64
+ mov myd, r6m
+ %if cpuflag(ssse3)
+ pxor m11, m11
+ %endif
+%endif
+ call .h
+ psrld m2, m0, 16
+ psrld m3, m1, 16
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova [esp+gprsize+0x00], m2
+ %endif
+ mova [esp+gprsize+0x10], m3
+%endif
+ call .h
+ psrld m4, m0, 16
+ psrld m5, m1, 16
+%if ARCH_X86_32
+ mova [esp+gprsize+0x20], m4
+ mova [esp+gprsize+0x30], m5
+%endif
+ call .h
+%if ARCH_X86_64
+ %define blendmask [rsp+gprsize+0x80]
+%else
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
+ mova m3, [esp+gprsize+0x10]
+ %define blendmask [esp+gprsize+0x120]
+ %define m10 m7
+%endif
+ pcmpeqd m10, m10
+ pslld m10, 16
+ mova blendmask, m10
+ BLENDHWDW m2, m0 ; 0
+ BLENDHWDW m3, m1 ; 2
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 1
+ BLENDHWDW m5, m1 ; 3
+ mova [rsp+gprsize+0x20], m4
+ mova [rsp+gprsize+0x30], m5
+ call .h
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
+ mova m3, [esp+gprsize+0x10]
+ %define m10 m5
+%endif
+ psrld m6, m2, 16
+ psrld m7, m3, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 2
+ BLENDHWDW m7, m1 ; 4
+ mova [rsp+gprsize+0x40], m6
+ mova [rsp+gprsize+0x50], m7
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ psrld m2, m4, 16
+ psrld m3, m5, 16
+ mova m10, blendmask
+ BLENDHWDW m2, m0 ; 3
+ BLENDHWDW m3, m1 ; 5
+ mova [rsp+gprsize+0x60], m2
+ mova [rsp+gprsize+0x70], m3
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x40]
+ mova m7, [esp+gprsize+0x50]
+ %define m10 m7
+%endif
+ psrld m4, m6, 16
+ psrld m5, m7, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 4
+ BLENDHWDW m5, m1 ; 6
+%if ARCH_X86_64
+ add myd, 512+(64<<10)
+ mova m6, m2
+ mova m7, m3
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ add dword mym, 512+(64<<10)
+%endif
+ mov counterd, 4
+ SAVE_ALPHA_BETA
+.main2:
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x60]
+ mova m7, [esp+gprsize+0x70]
+ %define m10 m5
+%endif
+ psrld m6, 16
+ psrld m7, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 5
+ BLENDHWDW m7, m1 ; 7
+%if ARCH_X86_64
+ WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5, \
+ [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7
+%else
+ mova [esp+gprsize+0xA0], m6
+ mova [esp+gprsize+0xB0], m7
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0]
+ LOAD_ALPHA_BETA_MX
+%endif
+ call .h
+ mova m2, [rsp+gprsize+0x40]
+ mova m3, [rsp+gprsize+0x50]
+%if ARCH_X86_32
+ mova m4, [rsp+gprsize+0x80]
+ mova m5, [rsp+gprsize+0x90]
+ %define m10 m7
+%endif
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ mova [rsp+gprsize+0x40], m4
+ mova [rsp+gprsize+0x50], m5
+ psrld m4, 16
+ psrld m5, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 6
+ BLENDHWDW m5, m1 ; 8
+%if ARCH_X86_64
+ WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7, \
+ [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90]
+ mov mym, myd
+ mov dstd, dstm
+ mov dsd, dsm
+ mov mxd, mxm
+%endif
+ mova m2, [rsp+gprsize+0x60]
+ mova m3, [rsp+gprsize+0x70]
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0xA0]
+ mova m7, [esp+gprsize+0xB0]
+%endif
+ mova [rsp+gprsize+0x20], m2
+ mova [rsp+gprsize+0x30], m3
+ mova [rsp+gprsize+0x60], m6
+ mova [rsp+gprsize+0x70], m7
+ ret
+ALIGN function_align
+.h:
+%if ARCH_X86_32
+ %define m8 m3
+ %define m9 m4
+ %define m10 m5
+ %define m14 m6
+ %define m15 m7
+%endif
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+%if ARCH_X86_32
+ %assign stack_offset stack_offset+4
+ %assign stack_size stack_size+4
+ %define PIC_mem [esp+gprsize*2+0x114]
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movu m10, [srcq]
+%if ARCH_X86_32
+ add srcd, ssm
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%else
+ add srcq, ssq
+%endif
+ shr mxd, 10
+ shr tmp1d, 10
+ movq m1, [filterq+mxq *8] ; 0 X
+ movq m8, [filterq+tmp1q*8] ; 4 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m1, [filterq+tmp2q*8] ; 0 1
+ movhps m8, [filterq+tmp1q*8] ; 4 5
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10
+ shr tmp1d, 10
+%if cpuflag(ssse3)
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ pshufb m0, m10, [PIC_sym(warp_8x8_shufA)]
+ pmaddubsw m0, m1
+ pshufb m1, m10, [PIC_sym(warp_8x8_shufB)]
+ pmaddubsw m1, m8
+ pshufb m15, m10, [PIC_sym(warp_8x8_shufC)]
+ pmaddubsw m15, m14
+ pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
+ pmaddubsw m10, m9
+ phaddw m0, m15
+ phaddw m1, m10
+%else
+ %if ARCH_X86_32
+ %define m11 m2
+ %endif
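+ ; pre-SSSE3 path: the warp_8x8_shufA-D patterns are built by hand with
+ ; shift/or sequences, pmaddubsw is emulated with sign/zero-extended pmullw
+ ; pairs and phaddw with shift/pack steps (see the trailing comments below).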
+ pcmpeqw m0, m0
+ psrlw m14, m0, 8
+ psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15
+ pand m14, m10 ; 00 02 04 06 08 10 12 14
+ packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15
+ psrldq m9, m0, 4
+ pshufd m0, m14, q0220
+ pand m0, m9
+ psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __
+ pslldq m15, m14, 12
+ por m0, m15 ; shufA
+ psrlw m15, m0, 8
+ psraw m11, m1, 8
+ psllw m0, 8
+ psllw m1, 8
+ psrlw m0, 8
+ psraw m1, 8
+ pmullw m15, m11
+ pmullw m0, m1
+ paddw m0, m15 ; pmaddubsw m0, m1
+ pshufd m15, m14, q0220
+ pand m15, m9
+ psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __
+ pslldq m1, m14, 12
+ por m15, m1 ; shufC
+ pshufd m1, m14, q0220
+ pand m1, m9
+ psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __
+ pslldq m11, m14, 12
+ por m1, m11 ; shufB
+ pshufd m10, m14, q0220
+ pand m10, m9
+ psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __
+ pslldq m14, m14, 12
+ por m10, m14 ; shufD
+ psrlw m9, m1, 8
+ psraw m11, m8, 8
+ psllw m1, 8
+ psllw m8, 8
+ psrlw m1, 8
+ psraw m8, 8
+ pmullw m9, m11
+ pmullw m1, m8
+ paddw m1, m9 ; pmaddubsw m1, m8
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ psrlw m8, m15, 8
+ psraw m11, m14, 8
+ psllw m15, 8
+ psllw m14, 8
+ psrlw m15, 8
+ psraw m14, 8
+ pmullw m8, m11
+ pmullw m15, m14
+ paddw m15, m8 ; pmaddubsw m15, m14
+ psrlw m8, m10, 8
+ psraw m11, m9, 8
+ psllw m10, 8
+ psllw m9, 8
+ psrlw m10, 8
+ psraw m9, 8
+ pmullw m8, m11
+ pmullw m10, m9
+ paddw m10, m8 ; pmaddubsw m10, m9
+ pslld m8, m0, 16
+ pslld m9, m1, 16
+ pslld m14, m15, 16
+ pslld m11, m10, 16
+ paddw m0, m8
+ paddw m1, m9
+ paddw m15, m14
+ paddw m10, m11
+ psrad m0, 16
+ psrad m1, 16
+ psrad m15, 16
+ psrad m10, 16
+ packssdw m0, m15 ; phaddw m0, m15
+ packssdw m1, m10 ; phaddw m1, m10
+%endif
+ mova m14, [PIC_sym(pw_8192)]
+ mova m9, [PIC_sym(pd_32768)]
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ pmaddwd m1, m14
+ paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
+ paddd m1, m9
+ ret
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%macro BIDIR_FN 1 ; op
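+ ; common store loop for avg/w_avg/mask: %1 packs 16 output pixels into m0,
+ ; %1_INC_PTR advances the intermediate buffers, and dispatch is through the
+ ; per-width jump table the caller loaded into wq.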
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w4: ; tile 4x
+ movd [dstq ], m0 ; copy dw[0]
+ pshuflw m1, m0, q1032 ; swap dw[1] and dw[0]
+ movd [dstq+strideq*1], m1 ; copy dw[1]
+ punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0]
+ movd [dstq+strideq*2], m0 ; dw[2]
+ psrlq m0, 32 ; shift dw[3] into the low dword
+ movd [dstq+stride3q ], m0 ; copy dw[3]
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movq [dstq ], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq]
+.w16:
+ mova [dstq ], m0
+ dec hd
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq]
+.w32:
+ mova [dstq ], m0
+ %1 2
+ mova [dstq + 16 ], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 8
+ %1 0
+ add dstq, strideq
+.w64:
+ %assign i 0
+ %rep 4
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 4
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ %1_INC_PTR 16
+ %1 0
+ add dstq, strideq
+.w128:
+ %assign i 0
+ %rep 8
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 8
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel
+ mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1
+ paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+ LEA r6, avg_ssse3_table
+ tzcnt wd, wm ; trailing zero count of the block width
+ movifnidn hd, hm ; move h(stack) to h(register) if not already that register
+ movsxd wq, dword [r6+wq*4] ; load the table entry matching the tile width (tzcnt index), sign-extended to a wide reg
+ mova m2, [pw_1024+r6-avg_ssse3_table] ; fill m2 with shift/align
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
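+ ; pmulhw by (+/-(weight-16)) << 12 realizes the >> 16 product above, and the
+ ; final pmulhrsw by 2048 performs the trailing (x + 8) >> 4 step.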
+ mova m2, [tmp1q+(%1+0)*mmsize]
+ mova m0, m2
+ psubw m2, [tmp2q+(%1+0)*mmsize]
+ mova m3, [tmp1q+(%1+1)*mmsize]
+ mova m1, m3
+ psubw m3, [tmp2q+(%1+1)*mmsize]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+ LEA r6, w_avg_ssse3_table
+ tzcnt wd, wm
+ movd m4, r6m
+ movifnidn hd, hm
+ pxor m0, m0
+ movsxd wq, dword [r6+wq*4]
+ mova m5, [pw_2048+r6-w_avg_ssse3_table]
+ pshufb m4, m0
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ psubw m0, m4
+ mov tmp1q, tmp2q
+ mova m4, m0 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
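+ ; -m is built as the byte pair {0, -2m}, i.e. -m << 9 as a signed word, so
+ ; pmulhw against (b - a) << 1 yields ((a - b) * m) >> 6; pmulhrsw by 2048
+ ; then performs the trailing (x + 8) >> 4 step.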
+ mova m3, [maskq+(%1+0)*(mmsize/2)]
+ mova m0, [tmp2q+(%1+0)*mmsize] ; b
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
+ mova m6, m3 ; m
+ psubb m3, m4, m6 ; -m
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3 ; -m << 1
+ punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
+ pmulhw m1, m2 ; (-m * (b - a)) << 10
+ paddw m0, m1 ; + b
+ mova m1, [tmp2q+(%1+1)*mmsize] ; b
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
+ paddw m2, m2 ; (b - a) << 1
+ mova m6, m3 ; (-m << 1)
+ punpckhbw m3, m4, m6 ; (-m << 9)
+ pmulhw m2, m3 ; (-m * (b - a)) << 10
+ paddw m1, m2 ; + b
+ pmulhrsw m0, m5 ; round
+ pmulhrsw m1, m5 ; round
+ packuswb m0, m1 ; interleave 16 -> 8
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*mmsize/2
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+%if ARCH_X86_64
+cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ movifnidn hd, hm
+%else
+cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
+%define hd dword r5m
+%endif
+%define base r6-mask_ssse3_table
+ LEA r6, mask_ssse3_table
+ tzcnt wd, wm
+ movsxd wq, dword [r6+wq*4]
+ pxor m4, m4
+ mova m5, [base+pw_2048]
+ add wq, r6
+ mov maskq, r6m
+ BIDIR_FN MASK
+%undef hd
+
+%macro W_MASK_420_END 1-*
+%rep %0
+ call .main
+ paddw m2, [maskq+16*%1]
+ mova [maskq+16*%1], m2
+ mova [dstq+strideq*1+16*(2*%1+0)], m0
+ call .main
+ psubw m3, m7, m2
+ psubw m1, m7, [maskq+16*%1]
+ psubw m3, [dstq+strideq*1+16*(2*%1+1)]
+ psrlw m1, 2
+ psrlw m3, 2
+ packuswb m1, m3
+ mova [maskq+16*%1], m1
+ mova [dstq+strideq*1+16*(2*%1+1)], m0
+ %rotate 1
+%endrep
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_420_ssse3_table
+ LEA t0, w_mask_420_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ sub tmp2q, tmp1q
+ movsxd wq, [t0+wq*4]
+ mova m6, [base+pw_2048]
+ movddup m7, [base+wm_420_sign+r6*8] ; 258 - sign
+ add wq, t0
+%if ARCH_X86_64
+ mova m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ movifnidn hd, hm
+%else
+ %define m8 [base+pw_6903]
+ %define hd dword hm
+%endif
+ mov maskq, maskmp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ add maskq, 4
+ lea dstq, [dstq+strideq*2]
+.w4:
+ pshufd m3, m2, q2020
+ pshufd m2, m2, q3131
+ psubw m1, m7, m3
+ psubw m1, m2
+ psrlw m1, 2
+ packuswb m1, m1
+ movd [maskq], m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ add maskq, 4
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movhlps m3, m2
+ psubw m1, m7, m2
+ psubw m1, m3
+ psrlw m1, 2
+ packuswb m1, m1
+ movd [maskq], m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ add maskq, 8
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*0], m0
+ call .main
+ psubw m1, m7, [dstq+strideq*1]
+ psubw m1, m2
+ psrlw m1, 2
+ packuswb m1, m1
+ movq [maskq], m1
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add maskq, 16
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [maskq], m2
+ mova [dstq+strideq*0+16*0], m0
+ call .main
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*1], m0
+ W_MASK_420_END 0
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add maskq, 16*2
+ lea dstq, [dstq+strideq*2]
+.w64:
+ mova [maskq+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ call .main
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*1], m0
+ call .main
+ mova [maskq+16*1], m2
+ mova [dstq+strideq*0+16*2], m0
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m0
+ W_MASK_420_END 0, 1
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add maskq, 16*4
+ lea dstq, [dstq+strideq*2]
+.w128:
+ mova [maskq+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ call .main
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*1], m0
+ call .main
+ mova [maskq+16*1], m2
+ mova [dstq+strideq*0+16*2], m0
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m0
+ call .main
+ mova [maskq+16*2], m2
+ mova [dstq+strideq*0+16*4], m0
+ call .main
+ mova [dstq+strideq*1+16*5], m2
+ mova [dstq+strideq*0+16*5], m0
+ call .main
+ mova [maskq+16*3], m2
+ mova [dstq+strideq*0+16*6], m0
+ call .main
+ mova [dstq+strideq*1+16*7], m2
+ mova [dstq+strideq*0+16*7], m0
+ W_MASK_420_END 0, 1, 2, 3
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
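+ ; blends 16 pixels from tmp1/tmp2 into m0 and leaves the pairwise-summed
+ ; (64 - m) weights in m2; the width loops fold those with a second row and
+ ; the 258-sign bias (m7) into the 2x2-subsampled mask.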
+ mova m0, [tmp1q +16*0]
+ mova m3, [tmp1q+tmp2q+16*0]
+ mova m1, [tmp1q +16*1]
+ mova m4, [tmp1q+tmp2q+16*1]
+ add tmp1q, 16*2
+ psubw m3, m0
+ psubw m4, m1
+ pabsw m5, m3
+ psubusw m2, m8, m5
+ psrlw m2, 8 ; 64 - m
+ psllw m5, m2, 10
+ pmulhw m3, m5
+ pabsw m5, m4
+ paddw m0, m3
+ psubusw m3, m8, m5
+ psrlw m3, 8
+ phaddw m2, m3
+ psllw m3, 10
+ pmulhw m4, m3
+ paddw m1, m4
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ ret
+
+%macro W_MASK_422_BACKUP 1 ; mask_offset
+%if ARCH_X86_64
+ mova m10, m2
+%else
+ mova [maskq+16*%1], m2
+%endif
+%endmacro
+
+%macro W_MASK_422_END 1 ; mask_offset
+%if ARCH_X86_64
+ packuswb m10, m2
+ psubb m1, m7, m10
+ pavgb m1, m9
+%else
+ mova m3, [maskq+16*%1]
+ packuswb m3, m2
+ pxor m2, m2
+ psubb m1, m7, m3
+ pavgb m1, m2
+%endif
+ mova [maskq+16*%1], m1
+%endmacro
+
+cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_422_ssse3_table
+ LEA t0, w_mask_422_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ sub tmp2q, tmp1q
+ movsxd wq, [t0+wq*4]
+ mova m6, [base+pw_2048]
+ movddup m7, [base+wm_422_sign+r6*8] ; 128 - sign
+ add wq, t0
+%if ARCH_X86_64
+ mova m8, [base+pw_6903]
+ pxor m9, m9
+ movifnidn hd, hm
+%else
+ add t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table
+ %define hd dword hm
+%endif
+ mov maskq, maskmp
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ jmp wq
+.w4_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 8
+ lea dstq, [dstq+strideq*2]
+.w4:
+ packuswb m2, m2
+ psubb m1, m7, m2
+%if ARCH_X86_64
+ pavgb m1, m9
+%else
+ pxor m2, m2
+ pavgb m1, m2
+%endif
+ movq [maskq], m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16
+ lea dstq, [dstq+strideq*2]
+.w8:
+ W_MASK_422_BACKUP 0
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ lea dstq, [dstq+strideq*2]
+ W_MASK_422_END 0
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16
+ lea dstq, [dstq+strideq*2]
+.w16:
+ W_MASK_422_BACKUP 0
+ mova [dstq+strideq*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16
+ add dstq, strideq
+.w32:
+ W_MASK_422_BACKUP 0
+ mova [dstq+16*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+16*1], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16*2
+ add dstq, strideq
+.w64:
+ W_MASK_422_BACKUP 0
+ mova [dstq+16*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+16*1], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 1
+ mova [dstq+16*2], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 1
+ mova [dstq+16*3], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16*4
+ add dstq, strideq
+.w128:
+ W_MASK_422_BACKUP 0
+ mova [dstq+16*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+16*1], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 1
+ mova [dstq+16*2], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 1
+ mova [dstq+16*3], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 2
+ mova [dstq+16*4], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 2
+ mova [dstq+16*5], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 3
+ mova [dstq+16*6], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 3
+ mova [dstq+16*7], m0
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_444_ssse3_table
+ LEA t0, w_mask_444_ssse3_table
+ tzcnt wd, wm
+ mov maskq, maskmp
+ sub tmp2q, tmp1q
+ movsxd wq, [t0+wq*4]
+ mova m6, [base+pw_6903]
+ mova m7, [base+pw_2048]
+ add wq, t0
+%if ARCH_X86_64
+ mova m8, [base+pb_64]
+ movifnidn hd, hm
+%else
+ %define m8 [base+pb_64]
+ %define hd dword hm
+%endif
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0], m0
+ call .main
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ call .main
+ mova [dstq+16*1], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ call .main
+ mova [dstq+16*1], m0
+ call .main
+ mova [dstq+16*2], m0
+ call .main
+ mova [dstq+16*3], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16*0], m0
+ call .main
+ mova [dstq+16*1], m0
+ call .main
+ mova [dstq+16*2], m0
+ call .main
+ mova [dstq+16*3], m0
+ call .main
+ mova [dstq+16*4], m0
+ call .main
+ mova [dstq+16*5], m0
+ call .main
+ mova [dstq+16*6], m0
+ call .main
+ mova [dstq+16*7], m0
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m0, [tmp1q +16*0]
+ mova m3, [tmp1q+tmp2q+16*0]
+ mova m1, [tmp1q +16*1]
+ mova m4, [tmp1q+tmp2q+16*1]
+ add tmp1q, 16*2
+ psubw m3, m0
+ psubw m4, m1
+ pabsw m5, m3
+ psubusw m2, m6, m5
+ psrlw m2, 8 ; 64 - m
+ psllw m5, m2, 10
+ pmulhw m3, m5
+ pabsw m5, m4
+ paddw m0, m3
+ psubusw m3, m6, m5
+ psrlw m3, 8
+ packuswb m2, m3
+ psllw m3, 10
+ pmulhw m4, m3
+ psubb m3, m8, m2
+ paddw m1, m4
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ mova [maskq], m3
+ add maskq, 16
+ packuswb m0, m1
+ ret
+
+%macro BLEND_64M 4; a, b, mask1, mask2
+ punpcklbw m0, %1, %2; {b;a}[7..0]
+ punpckhbw %1, %2 ; {b;a}[15..8]
+ pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16
+ pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16
+ pmulhrsw m0, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
+ pmulhrsw %1, m5 ; {((b*m[1] + (64-m[1])*a) + 1) / 32}[15..8] u16
+ packuswb m0, %1 ; {blendpx}[15..0] u8
+%endmacro
+
+%macro BLEND 2; a, b
+ psubb m3, m4, m0 ; m3 = (64 - m)
+ punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
+ punpckhbw m3, m0 ; {m;(64-m)}[15..8]
+ BLEND_64M %1, %2, m2, m3
+%endmacro
+
+cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_ssse3_table
+ LEA r6, blend_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movifnidn maskq, maskmp
+ movsxd wq, dword [r6+wq*4]
+ mova m4, [base+pb_64]
+ mova m5, [base+pw_512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movq m0, [maskq]; m
+ movd m1, [dstq+dsq*0] ; a
+ movd m6, [dstq+dsq*1]
+ punpckldq m1, m6
+ movq m6, [tmpq] ; b
+ psubb m3, m4, m0 ; m3 = (64 - m)
+ punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
+ punpcklbw m1, m6 ; {b;a}[7..0]
+ pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16
+ pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
+ packuswb m1, m0 ; {blendpx}[15..0] u8
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ add maskq, 8
+ add tmpq, 8
+ lea dstq, [dstq+dsq*2] ; dst_stride * 2
+ sub hd, 2
+ jg .w4
+ RET
+.w8:
+ mova m0, [maskq]; m
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m6, [tmpq] ; b
+ BLEND m1, m6
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ add maskq, 16
+ add tmpq, 16
+ lea dstq, [dstq+dsq*2] ; dst_stride * 2
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m0, [maskq]; m
+ mova m1, [dstq] ; a
+ mova m6, [tmpq] ; b
+ BLEND m1, m6
+ mova [dstq], m0
+ add maskq, 16
+ add tmpq, 16
+ add dstq, dsq ; dst_stride
+ dec hd
+ jg .w16
+ RET
+.w32:
+ %assign i 0
+ %rep 2
+ mova m0, [maskq+16*i]; m
+ mova m1, [dstq+16*i] ; a
+ mova m6, [tmpq+16*i] ; b
+ BLEND m1, m6
+ mova [dstq+i*16], m0
+ %assign i i+1
+ %endrep
+ add maskq, 32
+ add tmpq, 32
+ add dstq, dsq ; dst_stride
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_ssse3_table
+ LEA r5, blend_v_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ mova m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_ssse3_table
+ jmp wq
+.w2:
+ movd m3, [maskq+4]
+ punpckldq m3, m3
+ ; 2 mask blend is provided for 4 pixels / 2 lines
+.w2_loop:
+ movd m1, [dstq+dsq*0] ; a {..;a;a}
+ pinsrw m1, [dstq+dsq*1], 1
+ movd m2, [tmpq] ; b
+ punpcklbw m0, m1, m2; {b;a}[7..0]
+ pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16
+ pmulhrsw m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+ packuswb m0, m1 ; {blendpx}[8..0] u8
+ movd r3d, m0
+ mov [dstq+dsq*0], r3w
+ shr r3d, 16
+ mov [dstq+dsq*1], r3w
+ add tmpq, 2*2
+ lea dstq, [dstq + dsq * 2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ movddup m3, [maskq+8]
+ ; 4 mask blend is provided for 8 pixels / 2 lines
+.w4_loop:
+ movd m1, [dstq+dsq*0] ; a
+ movd m2, [dstq+dsq*1] ;
+ punpckldq m1, m2
+ movq m2, [tmpq] ; b
+ punpcklbw m1, m2 ; {b;a}[7..0]
+ pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16
+ pmulhrsw m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+ packuswb m1, m1 ; {blendpx}[8..0] u8
+ movd [dstq], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ add tmpq, 2*4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova m3, [maskq+16]
+ ; 8 mask blend is provided for 16 pixels
+.w8_loop:
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m2, [tmpq]; b
+ BLEND_64M m1, m2, m3, m3
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ add tmpq, 16
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ ; 16 mask blend is provided for 32 pixels
+ mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
+ mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
+.w16_loop:
+ mova m1, [dstq] ; a
+ mova m2, [tmpq] ; b
+ BLEND_64M m1, m2, m3, m4
+ mova [dstq], m0
+ add tmpq, 16
+ add dstq, dsq
+ dec hd
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ mova [rsp+8], xmm6
+%endif
+ mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
+ mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
+ mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
+ ; 16 mask blend is provided for 64 pixels
+.w32_loop:
+ mova m1, [dstq+16*0] ; a
+ mova m2, [tmpq+16*0] ; b
+ BLEND_64M m1, m2, m3, m4
+ movq m1, [dstq+16*1] ; a
+ punpcklbw m1, [tmpq+16*1] ; b
+ pmaddubsw m1, m6
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ mova [dstq+16*0], m0
+ movq [dstq+16*1], m1
+ add tmpq, 32
+ add dstq, dsq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ mova xmm6, [rsp+8]
+%endif
+ RET
+
+cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base t0-blend_h_ssse3_table
+%if ARCH_X86_32
+ ; We need to keep the PIC pointer for w4, reload wd from stack instead
+ DECLARE_REG_TMP 6
+%else
+ DECLARE_REG_TMP 5
+ mov r6d, wd
+%endif
+ LEA t0, blend_h_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, dword [t0+wq*4]
+ mova m5, [base+pw_512]
+ add wq, t0
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ pinsrw m0, [dstq+dsq*1], 1
+ movd m2, [maskq+hq*2]
+ movd m1, [tmpq]
+ punpcklwd m2, m2
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ packuswb m0, m0
+ movd r3d, m0
+ mov [dstq+dsq*0], r3w
+ shr r3d, 16
+ mov [dstq+dsq*1], r3w
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 2*2
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+%if ARCH_X86_32
+ mova m3, [base+blend_shuf]
+%else
+ mova m3, [blend_shuf]
+%endif
+.w4_loop:
+ movd m0, [dstq+dsq*0]
+ movd m2, [dstq+dsq*1]
+ punpckldq m0, m2 ; a
+ movq m1, [tmpq] ; b
+ movq m2, [maskq+hq*2] ; m
+ pshufb m2, m3
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ packuswb m0, m0
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 4*2
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ movd m4, [maskq+hq*2]
+ punpcklwd m4, m4
+ pshufd m3, m4, q0000
+ pshufd m4, m4, q1111
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m2, [tmpq]
+ BLEND_64M m1, m2, m3, m4
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 8*2
+ add hq, 2
+ jl .w8
+ RET
+; w16/w32/w64/w128
+.w16:
+%if ARCH_X86_32
+ mov r6d, wm
+%endif
+ sub dsq, r6
+.w16_loop0:
+ movd m3, [maskq+hq*2]
+ pshuflw m3, m3, q0000
+ punpcklqdq m3, m3
+ mov wd, r6d
+.w16_loop:
+ mova m1, [dstq] ; a
+ mova m2, [tmpq] ; b
+ BLEND_64M m1, m2, m3, m3
+ mova [dstq], m0
+ add dstq, 16
+ add tmpq, 16
+ sub wd, 16
+ jg .w16_loop
+ add dstq, dsq
+ inc hq
+ jl .w16_loop0
+ RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh total filled size
+; iw, ih, copied block -> fill bottom, right
+; x, y, offset in bw/bh -> fill top, left
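+; e.g. x = -3 gives left_ext = 3: those columns are filled by replicating the
+; leftmost source pixel, while the top/bottom extensions replicate whole rows.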
+cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \
+ y, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+ pxor m1, m1
+
+%if ARCH_X86_64
+ %define reg_zero r12q
+ %define reg_tmp r10
+ %define reg_src srcq
+ %define reg_bottomext bottomextq
+ %define reg_rightext rightextq
+ %define reg_blkm r9m
+%else
+ %define reg_zero r6
+ %define reg_tmp r0
+ %define reg_src r1
+ %define reg_bottomext r0
+ %define reg_rightext r1
+ %define reg_blkm r2m
+%endif
+ ;
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor reg_zero, reg_zero
+ lea reg_tmp, [ihq-1]
+ cmp yq, ihq
+ cmovs reg_tmp, yq
+ test yq, yq
+ cmovs reg_tmp, reg_zero
+%if ARCH_X86_64
+ imul reg_tmp, sstrideq
+ add srcq, reg_tmp
+%else
+ imul reg_tmp, sstridem
+ mov reg_src, srcm
+ add reg_src, reg_tmp
+%endif
+ ;
+ ; ref += iclip(x, 0, iw - 1)
+ lea reg_tmp, [iwq-1]
+ cmp xq, iwq
+ cmovs reg_tmp, xq
+ test xq, xq
+ cmovs reg_tmp, reg_zero
+ add reg_src, reg_tmp
+%if ARCH_X86_32
+ mov srcm, reg_src
+%endif
+ ;
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+ mov r1, r1m ; restore bh
+%endif
+ lea reg_bottomext, [yq+bhq]
+ sub reg_bottomext, ihq
+ lea r3, [bhq-1]
+ cmovs reg_bottomext, reg_zero
+ ;
+
+ DEFINE_ARGS bw, bh, iw, ih, x, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, reg_zero
+ cmp reg_bottomext, bhq
+ cmovns reg_bottomext, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+ %if ARCH_X86_32
+ mov r4m, reg_bottomext
+ ;
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ mov r0, r0m ; restore bw
+ %endif
+ lea reg_rightext, [xq+bwq]
+ sub reg_rightext, iwq
+ lea r2, [bwq-1]
+ cmovs reg_rightext, reg_zero
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, reg_zero
+ cmp reg_rightext, bwq
+ cmovns reg_rightext, r2
+ %if ARCH_X86_32
+ mov r3m, r1
+ %endif
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+%else
+ mov r1, centerhm ; restore r1
+ sub centerhq, topextq
+ sub centerhq, r4m
+ mov r1m, centerhq
+%endif
+ ;
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+%if ARCH_X86_64
+ imul r2, dstrideq
+%else
+ mov r6, r6m ; restore dstq
+ imul r2, dstridem
+%endif
+ add dstq, r2
+ mov reg_blkm, dstq ; save pointer for ext
+ ;
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+%if ARCH_X86_64
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+%else
+ sub centerwq, r3m
+ sub centerwq, leftextq
+%endif
+
+; vloop Macro
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+ %if ARCH_X86_64
+ %define reg_tmp r12
+ %else
+ %define reg_tmp r0
+ %endif
+.v_loop_%3:
+ %if ARCH_X86_32
+ mov r0, r0m
+ mov r1, r1m
+ %endif
+%if %1
+ ; left extension
+ %if ARCH_X86_64
+ movd m0, [srcq]
+ %else
+ mov r3, srcm
+ movd m0, [r3]
+ %endif
+ pshufb m0, m1
+ xor r3, r3
+.left_loop_%3:
+ mova [dstq+r3], m0
+ add r3, mmsize
+ cmp r3, leftextq
+ jl .left_loop_%3
+ ; body
+ lea reg_tmp, [dstq+leftextq]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ %if ARCH_X86_64
+ movu m0, [srcq+r3]
+ %else
+ mov r1, srcm
+ movu m0, [r1+r3]
+ %endif
+%if %1
+ movu [reg_tmp+r3], m0
+%else
+ movu [dstq+r3], m0
+%endif
+ add r3, mmsize
+ cmp r3, centerwq
+ jl .body_loop_%3
+%if %2
+ ; right extension
+%if %1
+ add reg_tmp, centerwq
+%else
+ lea reg_tmp, [dstq+centerwq]
+%endif
+ %if ARCH_X86_64
+ movd m0, [srcq+centerwq-1]
+ %else
+ mov r3, srcm
+ movd m0, [r3+centerwq-1]
+ %endif
+ pshufb m0, m1
+ xor r3, r3
+.right_loop_%3:
+ movu [reg_tmp+r3], m0
+ add r3, mmsize
+ %if ARCH_X86_64
+ cmp r3, rightextq
+ %else
+ cmp r3, r3m
+ %endif
+ jl .right_loop_%3
+%endif
+ %if ARCH_X86_64
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+ %else
+ add dstq, dstridem
+ mov r0, sstridem
+ add srcm, r0
+ sub dword centerhm, 1
+ jg .v_loop_%3
+ mov r0, r0m ; restore r0
+ %endif
+%endmacro ; vloop MACRO
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ jnz .need_right_ext
+ %else
+ cmp leftextq, r3m ; leftextq == 0
+ jne .need_right_ext
+ %endif
+ v_loop 0, 0, 0
+ jmp .body_done
+
+ ; left/right extensions
+.need_left_ext:
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ %else
+ mov r3, r3m
+ test r3, r3
+ %endif
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+; r0 ; bw
+; r1 ; x loop
+; r4 ; y loop
+; r5 ; topextq
+; r6 ; dstq
+; r7 ; dstrideq
+; r8 ; srcq
+%if ARCH_X86_64
+ %define reg_dstride dstrideq
+%else
+ %define reg_dstride r2
+%endif
+ ;
+ ; bottom edge extension
+ %if ARCH_X86_64
+ test bottomextq, bottomextq
+ jz .top
+ %else
+ xor r1, r1
+ cmp r1, r4m
+ je .top
+ %endif
+ ;
+ %if ARCH_X86_64
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+ %else
+ mov r3, dstq
+ mov reg_dstride, dstridem
+ sub r3, reg_dstride
+ mov srcm, r3
+ %endif
+ ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, bottomextq
+ %else
+ mov r3, srcm
+ mova m0, [r3+r1]
+ lea r3, [dstq+r1]
+ mov r4, r4m
+ %endif
+ ;
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .bottom_y_loop
+ add r1, mmsize
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+%if ARCH_X86_64
+ mov srcq, reg_blkm
+%else
+ mov r3, reg_blkm
+ mov reg_dstride, dstridem
+%endif
+ mov dstq, dstm
+ xor r1, r1
+ ;
+.top_x_loop:
+%if ARCH_X86_64
+ mova m0, [srcq+r1]
+%else
+ mov r3, reg_blkm
+ mova m0, [r3+r1]
+%endif
+ lea r3, [dstq+r1]
+ mov r4, topextq
+ ;
+.top_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .top_y_loop
+ add r1, mmsize
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
+
+cextern resize_filter
+
+%macro SCRATCH 3
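+ ; x86-32 is short on XMM registers, so spill m%1 to stack slot %3 and alias
+ ; m%2 to that memory operand; on x86-64 a plain SWAP renames it instead.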
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%elif STACK_ALIGNMENT >= 16
+cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%else
+cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%endif
+ movifnidn dstq, dstmp
+ movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+ movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+ movifnidn hd, hm
+%endif
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ movd m7, dxm
+ movd m6, mx0m
+ movd m5, src_wm
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ pshufd m5, m5, q0000
+
+%if ARCH_X86_64
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+%else
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+%define hd dword r5m
+%if STACK_ALIGNMENT >= 16
+ LEA r6, $$
+%define base r6-$$
+%else
+ LEA r4, $$
+%define base r4-$$
+%endif
+%endif
+
+%if ARCH_X86_64
+ mova m10, [base+pw_m256]
+ mova m9, [base+pd_63]
+ mova m8, [base+pb_8x0_8x8]
+%else
+%define m10 [base+pw_m256]
+%define m9 [base+pd_63]
+%define m8 [base+pb_8x0_8x8]
+%endif
+ pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
+ pslld m7, 2 ; dx*4
+ pslld m5, 14
+ paddd m6, m4 ; mx+[0..3]*dx
+ SCRATCH 7, 13, 0
+ SCRATCH 6, 12, 1
+ SCRATCH 5, 11, 2
+
+ ; m10 = pmulhrsw constant for x=(x+64)>>7
+ ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = src_w, m9 = 0x3f, m8=0,8
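+ ; each .loop_x pass emits 4 pixels: 4 clamped source offsets come from the
+ ; integer part of mx, 8 bytes are fetched per pixel (pshufb-patched when the
+ ; offset was clamped), weighted by the matching resize_filter row and
+ ; rounded back to 8 bits.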
+
+.loop_y:
+ xor xd, xd
+ mova m0, m12 ; per-line working version of mx
+
+.loop_x:
+ pxor m1, m1
+ pcmpgtd m1, m0
+ pandn m1, m0
+ psrad m2, m0, 8 ; filter offset (unmasked)
+ pcmpgtd m3, m11, m1
+ pand m1, m3
+ pandn m3, m11
+ por m1, m3
+ psubd m3, m0, m1 ; pshufb offset
+ psrad m1, 14 ; clipped src_x offset
+ psrad m3, 14 ; pshufb edge_emu offset
+ pand m2, m9 ; filter offset (masked)
+
+ ; load source pixels
+%if ARCH_X86_64
+ movd r8d, m1
+ pshuflw m1, m1, q3232
+ movd r9d, m1
+ punpckhqdq m1, m1
+ movd r10d, m1
+ psrlq m1, 32
+ movd r11d, m1
+ movq m4, [srcq+r8]
+ movq m5, [srcq+r10]
+ movhps m4, [srcq+r9]
+ movhps m5, [srcq+r11]
+%else
+ movd r3d, m1
+ pshufd m1, m1, q3312
+ movd r1d, m1
+ pshuflw m1, m1, q3232
+ movq m4, [srcq+r3]
+ movq m5, [srcq+r1]
+ movd r3d, m1
+ punpckhqdq m1, m1
+ movd r1d, m1
+ movhps m4, [srcq+r3]
+ movhps m5, [srcq+r1]
+%endif
+
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ ; this also saves 2 quasi-vpgatherdqs
+ pxor m6, m6
+ pcmpeqb m6, m3
+%if ARCH_X86_64
+ pmovmskb r8d, m6
+ cmp r8d, 0xffff
+%else
+ pmovmskb r3d, m6
+ cmp r3d, 0xffff
+%endif
+ je .filter
+
+%if ARCH_X86_64
+ movd r8d, m3
+ pshuflw m3, m3, q3232
+ movd r9d, m3
+ punpckhqdq m3, m3
+ movd r10d, m3
+ psrlq m3, 32
+ movd r11d, m3
+ movsxd r8, r8d
+ movsxd r9, r9d
+ movsxd r10, r10d
+ movsxd r11, r11d
+ movq m6, [base+resize_shuf+4+r8]
+ movq m7, [base+resize_shuf+4+r10]
+ movhps m6, [base+resize_shuf+4+r9]
+ movhps m7, [base+resize_shuf+4+r11]
+%else
+ movd r3d, m3
+ pshufd m3, m3, q3312
+ movd r1d, m3
+ pshuflw m3, m3, q3232
+ movq m6, [base+resize_shuf+4+r3]
+ movq m7, [base+resize_shuf+4+r1]
+ movd r3d, m3
+ punpckhqdq m3, m3
+ movd r1d, m3
+ movhps m6, [base+resize_shuf+4+r3]
+ movhps m7, [base+resize_shuf+4+r1]
+%endif
+
+ paddb m6, m8
+ paddb m7, m8
+ pshufb m4, m6
+ pshufb m5, m7
+
+.filter:
+%if ARCH_X86_64
+ movd r8d, m2
+ pshuflw m2, m2, q3232
+ movd r9d, m2
+ punpckhqdq m2, m2
+ movd r10d, m2
+ psrlq m2, 32
+ movd r11d, m2
+ movq m6, [base+resize_filter+r8*8]
+ movq m7, [base+resize_filter+r10*8]
+ movhps m6, [base+resize_filter+r9*8]
+ movhps m7, [base+resize_filter+r11*8]
+%else
+ movd r3d, m2
+ pshufd m2, m2, q3312
+ movd r1d, m2
+ pshuflw m2, m2, q3232
+ movq m6, [base+resize_filter+r3*8]
+ movq m7, [base+resize_filter+r1*8]
+ movd r3d, m2
+ punpckhqdq m2, m2
+ movd r1d, m2
+ movhps m6, [base+resize_filter+r3*8]
+ movhps m7, [base+resize_filter+r1*8]
+%endif
+
+ pmaddubsw m4, m6
+ pmaddubsw m5, m7
+ phaddw m4, m5
+ phaddsw m4, m4
+ pmulhrsw m4, m10 ; x=(x+64)>>7
+ packuswb m4, m4
+ movd [dstq+xq], m4
+
+ paddd m0, m13
+ add xd, 4
+%if STACK_ALIGNMENT >= 16
+ cmp xd, dst_wd
+%else
+ cmp xd, dst_wm
+%endif
+ jl .loop_x
+
+ add dstq, dst_stridemp
+ add srcq, src_stridemp
+ dec hd
+ jg .loop_y
+ RET
+
+INIT_XMM ssse3
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse4
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse2
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm
new file mode 100644
index 0000000000..9f05c921a6
--- /dev/null
+++ b/third_party/dav1d/src/x86/msac.asm
@@ -0,0 +1,667 @@
+; Copyright © 2019, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64 ; avoids cacheline splits
+
+min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+pw_0xff00: times 8 dw 0xff00
+pw_32: times 8 dw 32
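+; min_prob is laid out directly before pw_0xff00: the decoders index it with a
+; negative offset (t2 = ~n_symbols), so each symbol picks up its
+; 4*(symbols remaining) minimum-probability term on top of the scaled cdf.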
+
+%if ARCH_X86_64
+%define resp resq
+%define movp movq
+%define c_shuf q3333
+%macro DECODE_SYMBOL_ADAPT_INIT 0-1
+%endmacro
+%else
+%define resp resd
+%define movp movd
+%define c_shuf q1111
+%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
+ mov t0, r0m
+ mov t1, r1m
+%if %1 == 0
+ mov t2, r2m
+%endif
+%if STACK_ALIGNMENT >= 16
+ sub esp, 40-%1*4
+%else
+ mov eax, esp
+ and esp, ~15
+ sub esp, 40-%1*4
+ mov [esp], eax
+%endif
+%endmacro
+%endif
+
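+; offsets below must stay in sync with the C-side MsacContext layout
+; (buffer position/end pointers, dif window, rng, cnt, allow_update_cdf).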
+struc msac
+ .buf: resp 1
+ .end: resp 1
+ .dif: resp 1
+ .rng: resd 1
+ .cnt: resd 1
+ .update_cdf: resd 1
+endstruc
+
+%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y)
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
+%define buf rsp+stack_offset+8 ; shadow space
+%elif UNIX64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
+%define buf rsp-40 ; red zone
+%else
+DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
+%define buf esp+8
+%endif
+
+INIT_XMM sse2
+cglobal msac_decode_symbol_adapt4, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m2, [t0+msac.rng]
+ movq m1, [t1]
+ movp m3, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2 ; -(n_symbols + 1)
+ pshuflw m2, m2, q0000
+ movd [buf+12], m2
+ pand m2, [rax]
+ mova m0, m1
+ psrlw m1, 6
+ psllw m1, 7
+ pmulhuw m1, m2
+ movq m2, [rax+t2*2]
+ pshuflw m3, m3, c_shuf
+ paddw m1, m2
+ mova [buf+16], m1
+ psubusw m1, m3
+ pxor m2, m2
+ pcmpeqw m1, m2 ; c >= v
+ pmovmskb eax, m1
+ test t3d, t3d
+ jz .renorm ; !allow_update_cdf
+
+; update_cdf:
+ movzx t3d, word [t1+t4*2] ; count
+ pcmpeqw m2, m2
+ mov t2d, t3d
+ shr t3d, 4
+ cmp t4d, 3
+ sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
+ cmp t2d, 32
+ adc t2d, 0 ; count + (count < 32)
+ movd m3, t3d
+ pavgw m2, m1 ; i >= val ? -1 : 32768
+ psubw m2, m0 ; for (i = 0; i < val; i++)
+ psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
+ psraw m2, m3 ; for (; i < n_symbols; i++)
+ paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
+ movq [t1], m0
+ mov [t1+t4*2], t2w
+
+.renorm:
+ tzcnt eax, eax
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax+16] ; v
+ movzx t2d, word [buf+rax+14] ; u
+ shr eax, 1
+.renorm2:
+%if ARCH_X86_64 == 0
+%if STACK_ALIGNMENT >= 16
+ add esp, 40
+%else
+ mov esp, [esp]
+%endif
+%endif
+ not t4
+ sub t2d, t1d ; rng
+ shl t1, gprsize*8-16
+ add t4, t1 ; ~dif
+.renorm3:
+ mov t1d, [t0+msac.cnt]
+ movifnidn t7, t0
+.renorm4:
+ bsr ecx, t2d
+ xor ecx, 15 ; d
+.renorm5:
+ shl t2d, cl
+ shl t4, cl
+ mov [t7+msac.rng], t2d
+ not t4
+ sub t1d, ecx
+ jae .end ; no refill required
+
+; refill:
+ mov t2, [t7+msac.buf]
+ mov rcx, [t7+msac.end]
+%if ARCH_X86_64 == 0
+ push t5
+%endif
+ lea t5, [t2+gprsize]
+ cmp t5, rcx
+ ja .refill_eob
+ mov t2, [t2]
+ lea ecx, [t1+23]
+ add t1d, 16
+ shr ecx, 3 ; shift_bytes
+ bswap t2
+ sub t5, rcx
+ shl ecx, 3 ; shift_bits
+ shr t2, cl
+ sub ecx, t1d ; shift_bits - 16 - cnt
+ mov t1d, gprsize*8-16
+ shl t2, cl
+ mov [t7+msac.buf], t5
+ sub t1d, ecx ; cnt + gprsize*8 - shift_bits
+ xor t4, t2
+%if ARCH_X86_64 == 0
+ pop t5
+%endif
+.end:
+ mov [t7+msac.cnt], t1d
+ mov [t7+msac.dif], t4
+ RET
+.refill_eob: ; avoid overreading the input buffer
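+; a full gprsize-wide load here could read past the end of the buffer,
+; so refill one byte at a time instead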
+ mov t5, rcx
+ mov ecx, gprsize*8-24
+ sub ecx, t1d ; c
+.refill_eob_loop:
+ cmp t2, t5
+ jae .refill_eob_end ; eob reached
+ movzx t1d, byte [t2]
+ inc t2
+ shl t1, cl
+ xor t4, t1
+ sub ecx, 8
+ jge .refill_eob_loop
+.refill_eob_end:
+ mov t1d, gprsize*8-24
+%if ARCH_X86_64 == 0
+ pop t5
+%endif
+ sub t1d, ecx
+ mov [t7+msac.buf], t2
+ mov [t7+msac.dif], t4
+ mov [t7+msac.cnt], t1d
+ RET
+
+cglobal msac_decode_symbol_adapt8, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m2, [t0+msac.rng]
+ mova m1, [t1]
+ movp m3, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2
+ pshuflw m2, m2, q0000
+ movd [buf+12], m2
+ punpcklqdq m2, m2
+ mova m0, m1
+ psrlw m1, 6
+ pand m2, [rax]
+ psllw m1, 7
+ pmulhuw m1, m2
+ movu m2, [rax+t2*2]
+ pshuflw m3, m3, c_shuf
+ paddw m1, m2
+ punpcklqdq m3, m3
+ mova [buf+16], m1
+ psubusw m1, m3
+ pxor m2, m2
+ pcmpeqw m1, m2
+ pmovmskb eax, m1
+ test t3d, t3d
+ jz m(msac_decode_symbol_adapt4, SUFFIX).renorm
+ movzx t3d, word [t1+t4*2]
+ pcmpeqw m2, m2
+ mov t2d, t3d
+ shr t3d, 4
+ cmp t4d, 3 ; may be called with n_symbols <= 2
+ sbb t3d, -5
+ cmp t2d, 32
+ adc t2d, 0
+ movd m3, t3d
+ pavgw m2, m1
+ psubw m2, m0
+ psubw m0, m1
+ psraw m2, m3
+ paddw m0, m2
+ mova [t1], m0
+ mov [t1+t4*2], t2w
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm
+
+cglobal msac_decode_symbol_adapt16, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m4, [t0+msac.rng]
+ mova m2, [t1]
+ mova m3, [t1+16]
+ movp m5, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2
+%if WIN64
+ sub rsp, 48 ; need 36 bytes, shadow space is only 32
+%endif
+ pshuflw m4, m4, q0000
+ movd [buf-4], m4
+ punpcklqdq m4, m4
+ mova m0, m2
+ psrlw m2, 6
+ mova m1, m3
+ psrlw m3, 6
+ pand m4, [rax]
+ psllw m2, 7
+ psllw m3, 7
+ pmulhuw m2, m4
+ pmulhuw m3, m4
+ movu m4, [rax+t2*2]
+ pshuflw m5, m5, c_shuf
+ paddw m2, m4
+ psubw m4, [rax-pw_0xff00+pw_32]
+ punpcklqdq m5, m5
+ paddw m3, m4
+ mova [buf], m2
+ psubusw m2, m5
+ mova [buf+16], m3
+ psubusw m3, m5
+ pxor m4, m4
+ pcmpeqw m2, m4
+ pcmpeqw m3, m4
+ packsswb m5, m2, m3
+ pmovmskb eax, m5
+ test t3d, t3d
+ jz .renorm
+ movzx t3d, word [t1+t4*2]
+ pcmpeqw m4, m4
+ mova m5, m4
+ lea t2d, [t3+80] ; only supports n_symbols > 2
+ shr t2d, 4
+ cmp t3d, 32
+ adc t3d, 0
+ pavgw m4, m2
+ pavgw m5, m3
+ psubw m4, m0
+ psubw m0, m2
+ movd m2, t2d
+ psubw m5, m1
+ psubw m1, m3
+ psraw m4, m2
+ psraw m5, m2
+ paddw m0, m4
+ paddw m1, m5
+ mova [t1], m0
+ mova [t1+16], m1
+ mov [t1+t4*2], t3w
+.renorm:
+ tzcnt eax, eax
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax*2]
+ movzx t2d, word [buf+rax*2-2]
+%if WIN64
+ add rsp, 48
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2
+
+cglobal msac_decode_bool_adapt, 0, 6, 0
+ movifnidn t1, r1mp
+ movifnidn t0, r0mp
+ movzx eax, word [t1]
+ movzx t3d, byte [t0+msac.rng+1]
+ mov t4, [t0+msac.dif]
+ mov t2d, [t0+msac.rng]
+%if ARCH_X86_64
+ mov t5d, eax
+%endif
+ and eax, ~63
+ imul eax, t3d
+%if UNIX64
+ mov t6, t4
+%endif
+ shr eax, 7
+ add eax, 4 ; v
+ mov t3d, eax
+ shl rax, gprsize*8-16 ; vw
+ sub t2d, t3d ; r - v
+ sub t4, rax ; dif - vw
+ setb al
+ cmovb t2d, t3d
+ mov t3d, [t0+msac.update_cdf]
+%if UNIX64
+ cmovb t4, t6
+%else
+ cmovb t4, [t0+msac.dif]
+%endif
+%if ARCH_X86_64 == 0
+ movzx eax, al
+%endif
+ not t4
+ test t3d, t3d
+ jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+%if UNIX64 == 0
+ push t6
+%endif
+ movzx t6d, word [t1+2]
+%if ARCH_X86_64 == 0
+ push t5
+ movzx t5d, word [t1]
+%endif
+ movifnidn t7, t0
+ lea ecx, [t6+64]
+ cmp t6d, 32
+ adc t6d, 0
+ mov [t1+2], t6w
+ imul t6d, eax, -32769
+ shr ecx, 4 ; rate
+ add t6d, t5d ; if (bit)
+ sub t5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
+ sar t6d, cl ; else
+ sub t5d, t6d ; cdf[0] -= cdf[0] >> rate;
+ mov [t1], t5w
+%if WIN64
+ mov t1d, [t7+msac.cnt]
+ pop t6
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
+%else
+%if ARCH_X86_64 == 0
+ pop t5
+ pop t6
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+%endif
+
+cglobal msac_decode_bool_equi, 0, 6, 0
+ movifnidn t0, r0mp
+ mov t1d, [t0+msac.rng]
+ mov t4, [t0+msac.dif]
+ mov t2d, t1d
+ mov t1b, 8
+ mov t3, t4
+ mov eax, t1d
+ shr t1d, 1 ; v
+ shl rax, gprsize*8-17 ; vw
+ sub t2d, t1d ; r - v
+ sub t4, rax ; dif - vw
+ cmovb t2d, t1d
+ mov t1d, [t0+msac.cnt]
+ cmovb t4, t3
+ movifnidn t7, t0
+ mov ecx, 0xbfff
+ setb al ; the upper 32 bits contain garbage but that's OK
+ sub ecx, t2d
+ not t4
+ ; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
+ ; i.e. (0 <= d <= 2) and v < (3 << 14)
+ shr ecx, 14 ; d
+%if ARCH_X86_64 == 0
+ movzx eax, al
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5
+
+cglobal msac_decode_bool, 0, 6, 0
+ movifnidn t0, r0mp
+ movifnidn t1d, r1m
+ movzx eax, byte [t0+msac.rng+1] ; r >> 8
+ mov t4, [t0+msac.dif]
+ mov t2d, [t0+msac.rng]
+ and t1d, ~63
+ imul eax, t1d
+ mov t3, t4
+ shr eax, 7
+ add eax, 4 ; v
+ mov t1d, eax
+ shl rax, gprsize*8-16 ; vw
+ sub t2d, t1d ; r - v
+ sub t4, rax ; dif - vw
+ cmovb t2d, t1d
+ cmovb t4, t3
+ setb al
+ not t4
+%if ARCH_X86_64 == 0
+ movzx eax, al
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+
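+; expanded twice below (with and without cdf adaptation) so that the
+; update_cdf check is hoisted out of the token loop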
+%macro HI_TOK 1 ; update_cdf
+%if ARCH_X86_64 == 0
+ mov eax, -24
+%endif
+%%loop:
+%if %1
+ movzx t2d, word [t1+3*2]
+%endif
+ mova m1, m0
+ pshuflw m2, m2, q0000
+ psrlw m1, 6
+ movd [buf+12], m2
+ pand m2, m4
+ psllw m1, 7
+ pmulhuw m1, m2
+%if ARCH_X86_64 == 0
+ add eax, 5
+ mov [buf+8], eax
+%endif
+ pshuflw m3, m3, c_shuf
+ paddw m1, m5
+ movq [buf+16], m1
+ psubusw m1, m3
+ pxor m2, m2
+ pcmpeqw m1, m2
+ pmovmskb eax, m1
+%if %1
+ lea ecx, [t2+80]
+ pcmpeqw m2, m2
+ shr ecx, 4
+ cmp t2d, 32
+ adc t2d, 0
+ movd m3, ecx
+ pavgw m2, m1
+ psubw m2, m0
+ psubw m0, m1
+ psraw m2, m3
+ paddw m0, m2
+ movq [t1], m0
+ mov [t1+3*2], t2w
+%endif
+ tzcnt eax, eax
+ movzx ecx, word [buf+rax+16]
+ movzx t2d, word [buf+rax+14]
+ not t4
+%if ARCH_X86_64
+ add t6d, 5
+%endif
+ sub eax, 5 ; setup for merging the tok_br and tok branches
+ sub t2d, ecx
+ shl rcx, gprsize*8-16
+ add t4, rcx
+ bsr ecx, t2d
+ xor ecx, 15
+ shl t2d, cl
+ shl t4, cl
+ movd m2, t2d
+ mov [t7+msac.rng], t2d
+ not t4
+ sub t5d, ecx
+ jae %%end
+ mov t2, [t7+msac.buf]
+ mov rcx, [t7+msac.end]
+%if UNIX64 == 0
+ push t8
+%endif
+ lea t8, [t2+gprsize]
+ cmp t8, rcx
+ ja %%refill_eob
+ mov t2, [t2]
+ lea ecx, [t5+23]
+ add t5d, 16
+ shr ecx, 3
+ bswap t2
+ sub t8, rcx
+ shl ecx, 3
+ shr t2, cl
+ sub ecx, t5d
+ mov t5d, gprsize*8-16
+ shl t2, cl
+ mov [t7+msac.buf], t8
+%if UNIX64 == 0
+ pop t8
+%endif
+ sub t5d, ecx
+ xor t4, t2
+%%end:
+ movp m3, t4
+%if ARCH_X86_64
+ add t6d, eax ; CF = tok_br < 3 || tok == 15
+ jnc %%loop
+ lea eax, [t6+30]
+%else
+ add eax, [buf+8]
+ jnc %%loop
+ add eax, 30
+%if STACK_ALIGNMENT >= 16
+ add esp, 36
+%else
+ mov esp, [esp]
+%endif
+%endif
+ mov [t7+msac.dif], t4
+ shr eax, 1
+ mov [t7+msac.cnt], t5d
+ RET
+%%refill_eob:
+ mov t8, rcx
+ mov ecx, gprsize*8-24
+ sub ecx, t5d
+%%refill_eob_loop:
+ cmp t2, t8
+ jae %%refill_eob_end
+ movzx t5d, byte [t2]
+ inc t2
+ shl t5, cl
+ xor t4, t5
+ sub ecx, 8
+ jge %%refill_eob_loop
+%%refill_eob_end:
+%if UNIX64 == 0
+ pop t8
+%endif
+ mov t5d, gprsize*8-24
+ mov [t7+msac.buf], t2
+ sub t5d, ecx
+ jmp %%end
+%endmacro
+
+cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
+ DECODE_SYMBOL_ADAPT_INIT 1
+%if ARCH_X86_64 == 0 && PIC
+ LEA t2, min_prob+12*2
+ %define base t2-(min_prob+12*2)
+%else
+ %define base 0
+%endif
+ movq m0, [t1]
+ movd m2, [t0+msac.rng]
+ mov eax, [t0+msac.update_cdf]
+ movq m4, [base+pw_0xff00]
+ movp m3, [t0+msac.dif]
+ movq m5, [base+min_prob+12*2]
+ mov t4, [t0+msac.dif]
+ mov t5d, [t0+msac.cnt]
+%if ARCH_X86_64
+ mov t6d, -24
+%endif
+ movifnidn t7, t0
+ test eax, eax
+ jz .no_update_cdf
+ HI_TOK 1
+.no_update_cdf:
+ HI_TOK 0
+
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal msac_decode_symbol_adapt16, 3, 6, 6
+ lea rax, [pw_0xff00]
+ vpbroadcastw m2, [t0+msac.rng]
+ mova m0, [t1]
+ vpbroadcastw m3, [t0+msac.dif+6]
+ vbroadcasti128 m4, [rax]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2
+ mov r5, rsp
+%if WIN64
+ and rsp, ~31
+ sub rsp, 40
+%else
+ and r5, ~31
+ %define buf r5-32
+%endif
+ psrlw m1, m0, 6
+ movd [buf-4], xm2
+ pand m2, m4
+ psllw m1, 7
+ pmulhuw m1, m2
+ paddw m1, [rax+t2*2]
+ mova [buf], m1
+ pmaxuw m1, m3
+ pcmpeqw m1, m3
+ pmovmskb eax, m1
+ test t3d, t3d
+ jz .renorm
+ movzx t3d, word [t1+t4*2]
+ pcmpeqw m2, m2
+ lea t2d, [t3+80]
+ shr t2d, 4
+ cmp t3d, 32
+ adc t3d, 0
+ movd xm3, t2d
+ pavgw m2, m1
+ psubw m2, m0
+ psubw m0, m1
+ psraw m2, xm3
+ paddw m0, m2
+ mova [t1], m0
+ mov [t1+t4*2], t3w
+.renorm:
+ tzcnt eax, eax
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax-0]
+ movzx t2d, word [buf+rax-2]
+ shr eax, 1
+%if WIN64
+ mov rsp, r5
+%endif
+ vzeroupper
+ jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
+%endif
diff --git a/third_party/dav1d/src/x86/msac.h b/third_party/dav1d/src/x86/msac.h
new file mode 100644
index 0000000000..0bb632fb31
--- /dev/null
+++ b/third_party/dav1d/src/x86/msac.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_MSAC_H
+#define DAV1D_SRC_X86_MSAC_H
+
+#include "src/cpu.h"
+
+unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
+unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
+unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
+
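+/* The SIMD symbol decoders are bound at compile time whenever SSE2 is
+ * guaranteed by the target (x86-64, or 32-bit builds with __SSE2__ or
+ * /arch:SSE2). The bool decoders only use general-purpose registers, so
+ * they are bound unconditionally. On x86-64, symbol_adapt16 instead goes
+ * through a per-context function pointer so that msac_init_x86() can
+ * select the AVX2 version at runtime. */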
+#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
+#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2
+#endif
+
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2
+
+#if ARCH_X86_64
+#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
+
+static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
+ }
+
+ if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+ }
+}
+
+#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
+#endif
+
+#endif /* DAV1D_SRC_X86_MSAC_H */
diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm
new file mode 100644
index 0000000000..06f555db11
--- /dev/null
+++ b/third_party/dav1d/src/x86/refmvs.asm
@@ -0,0 +1,688 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
+%macro JMP_TABLE 2-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %1_table:
+ %xdefine %%base %1_table
+ %rep %0 - 1
+ dd %%prefix %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
+ %rep %1
+ db %2*3
+ db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
+ mangle(private_prefix %+ _save_tmvs_%3).write1
+ %endrep
+%endmacro
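+; each table entry is two bytes: the cand_b step for that block size
+; (width in 8x8 units, times 3) and the offset of the matching .write<w>
+; routine from .write1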
+
+%if ARCH_X86_64
+splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
+ db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
+%endif
+save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0
+ db 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1
+save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2
+ db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3
+save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1
+cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3
+save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
+save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
+pb_128: times 16 db 128
+
+save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
+ SAVE_TMVS_TABLE 4, 8, ssse3
+ SAVE_TMVS_TABLE 4, 4, ssse3
+ SAVE_TMVS_TABLE 5, 2, ssse3
+ SAVE_TMVS_TABLE 7, 1, ssse3
+
+%if ARCH_X86_64
+save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
+ SAVE_TMVS_TABLE 4, 8, avx2
+ SAVE_TMVS_TABLE 4, 4, avx2
+ SAVE_TMVS_TABLE 5, 2, avx2
+ SAVE_TMVS_TABLE 7, 1, avx2
+
+save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
+ SAVE_TMVS_TABLE 4, 8, avx512icl
+ SAVE_TMVS_TABLE 4, 4, avx512icl
+ SAVE_TMVS_TABLE 5, 2, avx512icl
+ SAVE_TMVS_TABLE 7, 1, avx512icl
+
+JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
+JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32
+%endif
+
+JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32
+
+SECTION .text
+
+%macro movif32 2
+%if ARCH_X86_32
+ mov %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+%if ARCH_X86_64
+cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+%define base_reg r12
+%else
+cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+ movq m5, [ref_signq]
+ lea strided, [strided*5]
+ mov stridem, strided
+ mov r3, xstartm
+ mov r1, ystartm
+ DEFINE_ARGS b, ystart, rr, cand, xend, x
+%define stridemp r1m
+%define m8 [base+pb_128]
+%define m9 [base+save_pack0+ 0]
+%define m10 [base+save_pack0+16]
+%define base_reg r6
+%endif
+%define base base_reg-.write1
+ LEA base_reg, .write1
+%if ARCH_X86_64
+ movifnidn xendd, xendm
+ movifnidn yendd, yendm
+ mov xstartd, xstartm
+ mov ystartd, ystartm
+ movq m5, [ref_signq]
+%endif
+ movu m4, [base+save_ref_shuf]
+ movddup m6, [base+save_cond0]
+ movddup m7, [base+save_cond1]
+%if ARCH_X86_64
+ mova m8, [base+pb_128]
+ mova m9, [base+save_pack0+ 0]
+ mova m10, [base+save_pack0+16]
+%endif
+ psllq m5, 8
+%if ARCH_X86_64
+ lea r9d, [xendq*5]
+ lea xstartd, [xstartq*5]
+ sub yendd, ystartd
+ add ystartd, ystartd
+ lea strideq, [strideq*5]
+ sub xstartq, r9
+ add xendd, r9d
+ add rpq, r9
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+%else
+ lea r0, [xendd*5] ; xend5
+ lea r3, [r3*5] ; xstart5
+ sub r3, r0 ; -w5
+ mov r6m, r3
+%define xstartq r6m
+ add xendd, r0 ; xend6
+ add r0m, r0 ; rp+xend5
+ mov xendm, xendd
+ sub r5, r1 ; h
+ add r1, r1
+ mov r7m, r1
+ mov r5m, r5
+%define hd r5mp
+ jmp .loop_y_noload
+%endif
+.loop_y:
+ movif32 ystartd, r7m
+ movif32 xendd, xendm
+.loop_y_noload:
+ and ystartd, 30
+ mov xq, xstartq
+ mov bq, [rrq+ystartq*gprsize]
+ add ystartd, 2
+ movif32 r7m, ystartd
+ lea bq, [bq+xendq*4]
+.loop_x:
+%if ARCH_X86_32
+%define rpq r3
+%define r10 r1
+%define r10d r1
+%define r11 r4
+%define r11d r4
+%endif
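+; 0x9999 is roughly (3/5) in 16-bit fixed point; x is a negative multiple
+; of 5 here and the arithmetic shift rounds toward -inf, so imul+sar
+; yields exactly x / 5 * 3 for the coordinate range involved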
+ imul candq, xq, 0x9999 ; x / 5 * 3
+ sar candq, 16
+ movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
+ movu m0, [bq+candq*8+12] ; cand_b
+ movzx r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
+ movzx r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
+ add r10, base_reg
+ add candq, r11
+ jge .calc
+ movu m1, [bq+candq*8+12]
+ movzx r11d, byte [bq+candq*8+22]
+ movzx r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
+ add r11, base_reg
+.calc:
+ movif32 rpq, r0m
+ ; ref check
+ punpckhqdq m2, m0, m1
+ pshufb m2, m4 ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
+ pshufb m3, m5, m2 ; ref > 0 && ref_sign[ref - 1]
+ ; mv check
+ punpcklqdq m2, m0, m1 ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
+ pabsw m2, m2
+ psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
+ ; res
+ pcmpgtd m3, m2
+ pshufd m2, m3, q2301
+ pand m3, m6 ; b0c0 b0c1 b1c0 b1c1 | ...
+ pand m2, m7 ; b0c1 b0c0 b1c1 b1c0 | ...
+ por m3, m2 ; b0.shuf b1.shuf | ...
+ pxor m3, m8 ; if cond0|cond1 == 0 => zero out
+ pshufb m0, m3
+ pshufb m1, m3
+ call r10
+ jge .next_line
+ pshufd m0, m1, q3232
+ call r11
+ jl .loop_x
+.next_line:
+ add rpq, stridemp
+ movif32 r0m, rpq
+ dec hd
+ jg .loop_y
+ RET
+.write1:
+ movd [rpq+xq+0], m0
+ psrlq m0, 8
+ movd [rpq+xq+1], m0
+ add xq, 5*1
+ ret
+.write2:
+ movq [rpq+xq+0], m0
+ psrlq m0, 8
+ movd [rpq+xq+6], m0
+ add xq, 5*2
+ ret
+.write4:
+ pshufb m0, m9
+ movu [rpq+xq+ 0], m0
+ psrlq m0, 8
+ movd [rpq+xq+16], m0
+ add xq, 5*4
+ ret
+.write8:
+ pshufb m2, m0, m9
+ movu [rpq+xq+ 0], m2
+ pshufb m0, m10
+ movu [rpq+xq+16], m0
+ psrldq m2, 2
+ movq [rpq+xq+32], m2
+ add xq, 5*8
+ ret
+.write16:
+ pshufb m2, m0, m9
+ movu [rpq+xq+ 0], m2
+ pshufb m0, m10
+ movu [rpq+xq+16], m0
+ shufps m2, m0, q1032
+ movu [rpq+xq+48], m2
+ shufps m2, m0, q2121
+ movu [rpq+xq+32], m2
+ shufps m0, m2, q1032
+ movu [rpq+xq+64], m0
+ add xq, 5*16
+ ret
+
+INIT_XMM sse2
+; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
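+; m0-m2 hold the 12-byte refmvs_block rotated across dword lanes so that
+; three consecutive 16-byte stores lay down four contiguous copies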
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
+ add bx4d, bw4d
+ tzcnt bw4d, bw4d
+ mova m2, [aq]
+ LEA aq, splat_mv_sse2_table
+ lea bx4q, [bx4q*3-32]
+ movsxd bw4q, [aq+bw4q*4]
+ movifnidn bh4d, bh4m
+ pshufd m0, m2, q0210
+ pshufd m1, m2, q1021
+ pshufd m2, m2, q2102
+ add bw4q, aq
+.loop:
+ mov aq, [rrq]
+ add rrq, gprsize
+ lea aq, [aq+bx4q*4]
+ jmp bw4q
+.w32:
+ mova [aq-16*16], m0
+ mova [aq-16*15], m1
+ mova [aq-16*14], m2
+ mova [aq-16*13], m0
+ mova [aq-16*12], m1
+ mova [aq-16*11], m2
+ mova [aq-16*10], m0
+ mova [aq-16* 9], m1
+ mova [aq-16* 8], m2
+ mova [aq-16* 7], m0
+ mova [aq-16* 6], m1
+ mova [aq-16* 5], m2
+.w16:
+ mova [aq-16* 4], m0
+ mova [aq-16* 3], m1
+ mova [aq-16* 2], m2
+ mova [aq-16* 1], m0
+ mova [aq+16* 0], m1
+ mova [aq+16* 1], m2
+.w8:
+ mova [aq+16* 2], m0
+ mova [aq+16* 3], m1
+ mova [aq+16* 4], m2
+.w4:
+ mova [aq+16* 5], m0
+ mova [aq+16* 6], m1
+ mova [aq+16* 7], m2
+ dec bh4d
+ jg .loop
+ RET
+.w2:
+ movu [aq+104], m0
+ movq [aq+120], m1
+ dec bh4d
+ jg .loop
+ RET
+.w1:
+ movq [aq+116], m0
+ movd [aq+124], m2
+ dec bh4d
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+INIT_YMM avx2
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+%define base r12-.write1
+ lea r12, [.write1]
+ movifnidn xendd, xendm
+ movifnidn yendd, yendm
+ mov xstartd, xstartm
+ mov ystartd, ystartm
+ vpbroadcastq m4, [ref_signq]
+ vpbroadcastq m3, [base+save_ref_shuf+8]
+ vpbroadcastq m5, [base+save_cond0]
+ vpbroadcastq m6, [base+save_cond1]
+ vpbroadcastd m7, [base+pb_128]
+ mova m8, [base+save_pack0]
+ mova m9, [base+save_pack1]
+ psllq m4, 8
+ lea r9d, [xendq*5]
+ lea xstartd, [xstartq*5]
+ sub yendd, ystartd
+ add ystartd, ystartd
+ lea strideq, [strideq*5]
+ sub xstartq, r9
+ add xendd, r9d
+ add rpq, r9
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+.loop_y:
+ and ystartd, 30
+ mov xq, xstartq
+ mov bq, [rrq+ystartq*8]
+ add ystartd, 2
+ lea bq, [bq+xendq*4]
+.loop_x:
+ imul candq, xq, 0x9999
+ sar candq, 16 ; x / 5 * 3
+ movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
+ movu xm0, [bq+candq*8+12] ; cand_b
+ movzx r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
+ movzx r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
+ add r10, r12
+ add candq, r11
+ jge .calc
+ vinserti128 m0, [bq+candq*8+12], 1
+ movzx r11d, byte [bq+candq*8+22]
+ movzx r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
+ add r11, r12
+.calc:
+ pshufb m1, m0, m3
+ pabsw m2, m0
+ pshufb m1, m4, m1 ; ref > 0 && ref_sign[ref - 1]
+ psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
+ pcmpgtd m1, m2
+ pshufd m2, m1, q2301
+ pand m1, m5 ; b0.cond0 b1.cond0
+ pand m2, m6 ; b0.cond1 b1.cond1
+ por m1, m2 ; b0.shuf b1.shuf
+ pxor m1, m7 ; if cond0|cond1 == 0 => zero out
+ pshufb m0, m1
+ call r10
+ jge .next_line
+ vextracti128 xm0, m0, 1
+ call r11
+ jl .loop_x
+.next_line:
+ add rpq, strideq
+ dec hd
+ jg .loop_y
+ RET
+.write1:
+ movd [rpq+xq+ 0], xm0
+ pextrb [rpq+xq+ 4], xm0, 4
+ add xq, 5*1
+ ret
+.write2:
+ movq [rpq+xq+0], xm0
+ psrlq xm1, xm0, 8
+ movd [rpq+xq+6], xm1
+ add xq, 5*2
+ ret
+.write4:
+ pshufb xm1, xm0, xm8
+ movu [rpq+xq+ 0], xm1
+ psrlq xm1, 8
+ movd [rpq+xq+16], xm1
+ add xq, 5*4
+ ret
+.write8:
+ vinserti128 m1, m0, xm0, 1
+ pshufb m1, m8
+ movu [rpq+xq+ 0], m1
+ psrldq xm1, 2
+ movq [rpq+xq+32], xm1
+ add xq, 5*8
+ ret
+.write16:
+ vinserti128 m1, m0, xm0, 1
+ pshufb m2, m1, m8
+ movu [rpq+xq+ 0], m2
+ pshufb m1, m9
+ movu [rpq+xq+32], m1
+ shufps xm2, xm1, q1021
+ movu [rpq+xq+64], xm2
+ add xq, 5*16
+ ret
+
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
+ add bx4d, bw4d
+ tzcnt bw4d, bw4d
+ vbroadcasti128 m0, [aq]
+ lea aq, [splat_mv_avx2_table]
+ lea bx4q, [bx4q*3-32]
+ movsxd bw4q, [aq+bw4q*4]
+ pshufb m0, [splat_mv_shuf]
+ movifnidn bh4d, bh4m
+ pshufd m1, m0, q2102
+ pshufd m2, m0, q1021
+ add bw4q, aq
+.loop:
+ mov aq, [rrq]
+ add rrq, gprsize
+ lea aq, [aq+bx4q*4]
+ jmp bw4q
+.w32:
+ mova [aq-32*8], m0
+ mova [aq-32*7], m1
+ mova [aq-32*6], m2
+ mova [aq-32*5], m0
+ mova [aq-32*4], m1
+ mova [aq-32*3], m2
+.w16:
+ mova [aq-32*2], m0
+ mova [aq-32*1], m1
+ mova [aq+32*0], m2
+.w8:
+ mova [aq+32*1], m0
+ mova [aq+32*2], m1
+ mova [aq+32*3], m2
+ dec bh4d
+ jg .loop
+ RET
+.w4:
+ movu [aq+ 80], m0
+ mova [aq+112], xm1
+ dec bh4d
+ jg .loop
+ RET
+.w2:
+ movu [aq+104], xm0
+ movq [aq+120], xm2
+ dec bh4d
+ jg .loop
+ RET
+.w1:
+ movq [aq+116], xm0
+ movd [aq+124], xm1
+ dec bh4d
+ jg .loop
+ RET
+
+INIT_ZMM avx512icl
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+%define base r14-.write1
+ lea r14, [.write1]
+ movifnidn xendd, xendm
+ movifnidn yendd, yendm
+ mov xstartd, xstartm
+ mov ystartd, ystartm
+ psllq m4, [ref_signq]{bcstq}, 8
+ vpbroadcastq m3, [base+save_ref_shuf+8]
+ vbroadcasti32x4 m5, [base+cond_shuf512]
+ vbroadcasti32x4 m6, [base+save_cond0]
+ vpbroadcastd m7, [base+pb_128]
+ mova m8, [base+save_pack0]
+ movu xm9, [base+save_pack0+4]
+ lea r9d, [xendq*5]
+ lea xstartd, [xstartq*5]
+ sub yendd, ystartd
+ add ystartd, ystartd
+ lea strideq, [strideq*5]
+ sub xstartq, r9
+ add xendd, r9d
+ add rpq, r9
+ mov r10d, 0x1f
+ kmovb k2, r10d
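+; k2 = 0x1f masks 5 elements, so the masked byte/word/dword/qword stores
+; in .write1/2/4/8 write exactly 5/10/20/40 bytes per call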
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+.loop_y:
+ and ystartd, 30
+ mov xq, xstartq
+ mov bq, [rrq+ystartq*8]
+ add ystartd, 2
+ lea bq, [bq+xendq*4]
+.loop_x:
+ imul candq, xq, 0x9999
+ sar candq, 16 ; x / 5 * 3
+ movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
+ movu xm0, [bq+candq*8+12] ; cand_b
+ movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
+ movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
+ add r10, r14
+ add candq, r11
+ jge .calc
+ movzx r11d, byte [bq+candq*8+22]
+ vinserti32x4 ym0, [bq+candq*8+12], 1
+ movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
+ movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
+ add r11, r14
+ add candq, r12
+ jge .calc
+ movzx r12d, byte [bq+candq*8+22]
+ vinserti32x4 m0, [bq+candq*8+12], 2
+ movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
+ movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
+ add r12, r14
+ add candq, r13
+ jge .calc
+ vinserti32x4 m0, [bq+candq*8+12], 3
+ movzx r13d, byte [bq+candq*8+22]
+ movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
+ add r13, r14
+.calc:
+ pshufb m1, m0, m3
+ pabsw m2, m0
+ pshufb m1, m4, m1 ; ref > 0 && ref_sign[ref - 1]
+ psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
+ psubd m2, m1
+ pshufb m2, m5 ; c0 c1 c1 c0
+ pand m2, m6
+ punpckhqdq m1, m2, m2
+ vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
+ pshufb m2, m0, m1
+ mova xm0, xm2
+ call r10
+ jge .next_line
+ vextracti32x4 xm0, m2, 1
+ call r11
+ jge .next_line
+ vextracti32x4 xm0, m2, 2
+ call r12
+ jge .next_line
+ vextracti32x4 xm0, m2, 3
+ call r13
+ jl .loop_x
+.next_line:
+ add rpq, strideq
+ dec hd
+ jg .loop_y
+ RET
+.write1:
+ vmovdqu8 [rpq+xq]{k2}, xm0
+ add xq, 5*1
+ ret
+.write2:
+ pshufb xm0, xm8
+ vmovdqu16 [rpq+xq]{k2}, xm0
+ add xq, 5*2
+ ret
+.write4:
+ vpermb ym0, ym8, ym0
+ vmovdqu32 [rpq+xq]{k2}, ym0
+ add xq, 5*4
+ ret
+.write8:
+ vpermb m0, m8, m0
+ vmovdqu64 [rpq+xq]{k2}, m0
+ add xq, 5*8
+ ret
+.write16:
+ vpermb m1, m8, m0
+ movu [rpq+xq+ 0], m1
+ pshufb xm0, xm9
+ movu [rpq+xq+64], xm0
+ add xq, 5*16
+ ret
+
+INIT_ZMM avx512icl
+cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
+ vbroadcasti32x4 m0, [aq]
+ lea r1, [splat_mv_avx512icl_table]
+ tzcnt bw4d, bw4d
+ lea bx4d, [bx4q*3]
+ pshufb m0, [splat_mv_shuf]
+ movsxd bw4q, [r1+bw4q*4]
+ mov r6d, bh4m
+ add bw4q, r1
+ lea rrq, [rrq+r6*8]
+ mov r1d, 0x3f
+ neg r6
+ kmovb k1, r1d
+ jmp bw4q
+.w1:
+ mov r1, [rrq+r6*8]
+ vmovdqu16 [r1+bx4q*4]{k1}, xm0
+ inc r6
+ jl .w1
+ RET
+.w2:
+ mov r1, [rrq+r6*8]
+ vmovdqu32 [r1+bx4q*4]{k1}, ym0
+ inc r6
+ jl .w2
+ RET
+.w4:
+ mov r1, [rrq+r6*8]
+ vmovdqu64 [r1+bx4q*4]{k1}, m0
+ inc r6
+ jl .w4
+ RET
+.w8:
+ pshufd ym1, ym0, q1021
+.w8_loop:
+ mov r1, [rrq+r6*8+0]
+ mov r3, [rrq+r6*8+8]
+ movu [r1+bx4q*4+ 0], m0
+ mova [r1+bx4q*4+64], ym1
+ movu [r3+bx4q*4+ 0], m0
+ mova [r3+bx4q*4+64], ym1
+ add r6, 2
+ jl .w8_loop
+ RET
+.w16:
+ pshufd m1, m0, q1021
+ pshufd m2, m0, q2102
+.w16_loop:
+ mov r1, [rrq+r6*8+0]
+ mov r3, [rrq+r6*8+8]
+ mova [r1+bx4q*4+64*0], m0
+ mova [r1+bx4q*4+64*1], m1
+ mova [r1+bx4q*4+64*2], m2
+ mova [r3+bx4q*4+64*0], m0
+ mova [r3+bx4q*4+64*1], m1
+ mova [r3+bx4q*4+64*2], m2
+ add r6, 2
+ jl .w16_loop
+ RET
+.w32:
+ pshufd m1, m0, q1021
+ pshufd m2, m0, q2102
+.w32_loop:
+ mov r1, [rrq+r6*8]
+ lea r1, [r1+bx4q*4]
+ mova [r1+64*0], m0
+ mova [r1+64*1], m1
+ mova [r1+64*2], m2
+ mova [r1+64*3], m0
+ mova [r1+64*4], m1
+ mova [r1+64*5], m2
+ inc r6
+ jl .w32_loop
+ RET
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/refmvs.h b/third_party/dav1d/src/x86/refmvs.h
new file mode 100644
index 0000000000..9dafa78b13
--- /dev/null
+++ b/third_party/dav1d/src/x86/refmvs.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/refmvs.h"
+
+decl_save_tmvs_fn(dav1d_save_tmvs_ssse3);
+decl_save_tmvs_fn(dav1d_save_tmvs_avx2);
+decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl);
+
+decl_splat_mv_fn(dav1d_splat_mv_sse2);
+decl_splat_mv_fn(dav1d_splat_mv_avx2);
+decl_splat_mv_fn(dav1d_splat_mv_avx512icl);
+
+static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ c->splat_mv = dav1d_splat_mv_sse2;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_ssse3;
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_avx2;
+ c->splat_mv = dav1d_splat_mv_avx2;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_avx512icl;
+ c->splat_mv = dav1d_splat_mv_avx512icl;
+#endif
+}